Merge pull request #9864 from sbueringer/pr-node-inspection-failed
🌱 Mark Machine healthy condition as unknown if we can't list wl nodes
k8s-ci-robot authored Jan 12, 2024
2 parents 07cd79e + ee307a7 commit 39f8c4b
Showing 3 changed files with 42 additions and 12 deletions.
5 changes: 5 additions & 0 deletions api/v1beta1/condition_consts.go
@@ -207,6 +207,11 @@ const (

// NodeConditionsFailedReason (Severity=Warning) documents a node is not in a healthy state due to the failed state of at least 1 Kubelet condition.
NodeConditionsFailedReason = "NodeConditionsFailed"
+
+ // NodeInspectionFailedReason documents a failure in inspecting the node.
+ // This reason is used when the Machine controller is unable to list Nodes to find
+ // the corresponding Node for a Machine by ProviderID.
+ NodeInspectionFailedReason = "NodeInspectionFailed"
)

// Conditions and condition Reasons for the MachineHealthCheck object.
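Note that the new reason pairs with a condition of status `Unknown` rather than `False`: it signals "could not determine Node health", not "determined the Node is unhealthy". A minimal sketch of how a consumer might detect this state using the existing `util/conditions` helpers; the package and function names here are illustrative assumptions, not part of this change:

```go
package example // hypothetical package, for illustration only

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// nodeInspectionFailed reports whether the Machine's NodeHealthy condition
// was marked Unknown because the controller could not look up its Node.
func nodeInspectionFailed(machine *clusterv1.Machine) bool {
	return conditions.IsUnknown(machine, clusterv1.MachineNodeHealthyCondition) &&
		conditions.GetReason(machine, clusterv1.MachineNodeHealthyCondition) == clusterv1.NodeInspectionFailedReason
}
```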
2 changes: 1 addition & 1 deletion internal/controllers/machine/machine_controller_noderef.go
@@ -79,8 +79,8 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
// No need to requeue here. Nodes emit an event that triggers reconciliation.
return ctrl.Result{}, nil
}
log.Error(err, "Failed to retrieve Node by ProviderID")
- r.recorder.Event(machine, corev1.EventTypeWarning, "Failed to retrieve Node by ProviderID", err.Error())
+ conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeInspectionFailedReason, "Failed to get the Node for this Machine by ProviderID")
return ctrl.Result{}, err
}

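For context: `reconcileNode` resolves a Machine's Node by listing workload-cluster Nodes through a ProviderID field index, and that lookup can fail either because the List call itself errors or because it matches an unexpected number of Nodes. A simplified sketch of such a lookup under those assumptions; the names `getNodeByProviderID` and `errNodeNotFound` are hypothetical, while `index.NodeProviderIDField` appears in the test diff below:

```go
package example // illustration only; not the controller's exact code

import (
	"context"
	"errors"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/cluster-api/api/v1beta1/index"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// errNodeNotFound is a hypothetical sentinel for "no Node registered yet",
// which callers treat as retriable rather than as an inspection failure.
var errNodeNotFound = errors.New("no node found for providerID")

func getNodeByProviderID(ctx context.Context, c client.Reader, providerID string) (*corev1.Node, error) {
	nodeList := corev1.NodeList{}
	if err := c.List(ctx, &nodeList, client.MatchingFields{index.NodeProviderIDField: providerID}); err != nil {
		// The List failed outright: Node health is now unknowable, which is
		// why the controller marks the condition Unknown instead of False.
		return nil, err
	}
	switch len(nodeList.Items) {
	case 0:
		return nil, errNodeNotFound
	case 1:
		return &nodeList.Items[0], nil
	default:
		// Duplicate ProviderIDs (exercised by the new test case below) make
		// the lookup ambiguous, so they also surface as an error.
		return nil, fmt.Errorf("found %d Nodes with ProviderID %q, expected exactly one", len(nodeList.Items), providerID)
	}
}
```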
47 changes: 36 additions & 11 deletions internal/controllers/machine/machine_controller_test.go
@@ -847,7 +847,9 @@ func TestMachineConditions(t *testing.T) {
infraReady bool
bootstrapReady bool
beforeFunc func(bootstrap, infra *unstructured.Unstructured, m *clusterv1.Machine)
+ additionalObjects []client.Object
conditionsToAssert []*clusterv1.Condition
+ wantErr bool
}{
{
name: "all conditions true",
@@ -952,6 +954,26 @@
conditions.FalseCondition(clusterv1.ReadyCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, ""),
},
},
+ {
+ name: "machine ready and MachineNodeHealthy unknown",
+ infraReady: true,
+ bootstrapReady: true,
+ additionalObjects: []client.Object{&corev1.Node{
+ // This is a duplicate node with the same providerID
+ // This should lead to an error when trying to get the Node for a Machine.
+ ObjectMeta: metav1.ObjectMeta{
+ Name: "test-duplicate",
+ },
+ Spec: corev1.NodeSpec{ProviderID: "test://id-1"},
+ }},
+ wantErr: true,
+ conditionsToAssert: []*clusterv1.Condition{
+ conditions.TrueCondition(clusterv1.InfrastructureReadyCondition),
+ conditions.TrueCondition(clusterv1.BootstrapReadyCondition),
+ conditions.TrueCondition(clusterv1.ReadyCondition),
+ conditions.UnknownCondition(clusterv1.MachineNodeHealthyCondition, clusterv1.NodeInspectionFailedReason, "Failed to get the Node for this Machine by ProviderID"),
+ },
+ },
}

for _, tt := range testcases {
@@ -966,28 +988,31 @@
tt.beforeFunc(bootstrap, infra, m)
}

- clientFake := fake.NewClientBuilder().WithObjects(
- &testCluster,
- m,
- builder.GenericInfrastructureMachineCRD.DeepCopy(),
- infra,
- builder.GenericBootstrapConfigCRD.DeepCopy(),
- bootstrap,
- node,
- ).
+ objs := []client.Object{&testCluster, m, node,
+ builder.GenericInfrastructureMachineCRD.DeepCopy(), infra,
+ builder.GenericBootstrapConfigCRD.DeepCopy(), bootstrap,
+ }
+ objs = append(objs, tt.additionalObjects...)
+
+ clientFake := fake.NewClientBuilder().WithObjects(objs...).
WithIndex(&corev1.Node{}, index.NodeProviderIDField, index.NodeByProviderID).
WithStatusSubresource(&clusterv1.Machine{}).
Build()

r := &Reconciler{
Client: clientFake,
UnstructuredCachingClient: clientFake,
recorder: record.NewFakeRecorder(10),
Tracker: remote.NewTestClusterCacheTracker(logr.New(log.NullLogSink{}), clientFake, scheme.Scheme, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}),
ssaCache: ssa.NewCache(),
}

_, err := r.Reconcile(ctx, reconcile.Request{NamespacedName: util.ObjectKey(&machine)})
- g.Expect(err).ToNot(HaveOccurred())
+ if tt.wantErr {
+ g.Expect(err).To(HaveOccurred())
+ } else {
+ g.Expect(err).ToNot(HaveOccurred())
+ }

m = &clusterv1.Machine{}
g.Expect(r.Client.Get(ctx, client.ObjectKeyFromObject(&machine), m)).ToNot(HaveOccurred())
@@ -2233,7 +2258,7 @@ func assertCondition(t *testing.T, from conditions.Getter, condition *clusterv1.
g.Expect(conditions.Has(from, condition.Type)).To(BeTrue())

if condition.Status == corev1.ConditionTrue {
- conditions.IsTrue(from, condition.Type)
+ g.Expect(conditions.IsTrue(from, condition.Type)).To(BeTrue())
} else {
conditionToBeAsserted := conditions.Get(from, condition.Type)
g.Expect(conditionToBeAsserted.Status).To(Equal(condition.Status))
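The last hunk also fixes a latent no-op in `assertCondition`: `conditions.IsTrue` only returns a bool, so calling it without wrapping it in an expectation asserted nothing. A short Gomega illustration of the before/after pattern; `assertReady` is a hypothetical helper, not code from this PR:

```go
package example_test // illustration only

import (
	"testing"

	. "github.com/onsi/gomega"
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/conditions"
)

func assertReady(t *testing.T, machine *clusterv1.Machine) {
	g := NewWithT(t)

	// Before the fix: the boolean result was computed and silently discarded,
	// so a regression in a True condition could never fail the test.
	_ = conditions.IsTrue(machine, clusterv1.ReadyCondition)

	// After the fix: the result is wrapped in an expectation and asserted.
	g.Expect(conditions.IsTrue(machine, clusterv1.ReadyCondition)).To(BeTrue())
}
```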
