From c5881e2e298c2fe4e9cd2074a024d9c4ad35ff7f Mon Sep 17 00:00:00 2001 From: huangwei Date: Sat, 3 Feb 2024 22:40:48 +0800 Subject: [PATCH 01/14] Prioritize deletion of abnormal outdated CP machines when scaling down KCP --- .../kubeadm/internal/control_plane.go | 6 ++ .../internal/controllers/remediation_test.go | 2 + .../kubeadm/internal/controllers/scale.go | 2 + .../internal/controllers/scale_test.go | 57 ++++++++++++++++--- .../20191017-kubeadm-based-control-plane.md | 1 + util/collections/machine_filters.go | 32 +++++++++++ 6 files changed, 92 insertions(+), 8 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index ace8ff12ee53..7f875c38b173 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -314,3 +314,9 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC c.managementCluster = managementCluster c.workloadCluster = nil } + +// ControlPlaneMachinesWithUnhealthyCondition Returns unhealthy control plane machines. Unlike the UnhealthyMachines function, +// it checks if all health conditions on the control plane machines are true, not just marked by MHC. +func (c *ControlPlane) ControlPlaneMachinesWithUnhealthyCondition(machines collections.Machines) collections.Machines { + return machines.Filter(collections.HasUnhealthyControlPlaneMachineCondition(c.IsEtcdManaged())) +} diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index bc0b173b2776..6c72982b710d 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -1877,6 +1877,8 @@ func withHealthyEtcdMember() machineOption { func withUnhealthyEtcdMember() machineOption { return func(machine *clusterv1.Machine) { + newConditions := machine.Status.Conditions.DeepCopy() + machine.Status.Conditions = newConditions conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "") } } diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index f1b0e659c873..6a992903d36d 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -230,6 +230,8 @@ func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.Contr machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines) case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0: machines = controlPlane.MachineWithDeleteAnnotation(machines) + case controlPlane.ControlPlaneMachinesWithUnhealthyCondition(outdatedMachines).Len() > 0: + machines = controlPlane.ControlPlaneMachinesWithUnhealthyCondition(outdatedMachines) case outdatedMachines.Len() > 0: machines = outdatedMachines } diff --git a/controlplane/kubeadm/internal/controllers/scale_test.go b/controlplane/kubeadm/internal/controllers/scale_test.go index b2f3267a6061..4268a39c39fb 100644 --- a/controlplane/kubeadm/internal/controllers/scale_test.go +++ b/controlplane/kubeadm/internal/controllers/scale_test.go @@ -368,14 +368,35 @@ func TestSelectMachineForScaleDown(t *testing.T) { Spec: controlplanev1.KubeadmControlPlaneSpec{}, } startDate := time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC) - m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour))) - m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour))) - m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour))) - m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour))) - m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour))) - m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour))) - m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine")) - m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine")) + + controlPlaneHealthyConditions := []clusterv1.Condition{ + *conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition), + } + + m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)), + machineOpt(withNodeRef("machine-1")), withConditions(controlPlaneHealthyConditions)) + m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)), + machineOpt(withNodeRef("machine-2")), withConditions(controlPlaneHealthyConditions)) + m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)), + machineOpt(withNodeRef("machine-3")), withConditions(controlPlaneHealthyConditions)) + m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)), + machineOpt(withNodeRef("machine-4")), withConditions(controlPlaneHealthyConditions)) + m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), + machineOpt(withNodeRef("machine-5")), withConditions(controlPlaneHealthyConditions)) + m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)), + machineOpt(withNodeRef("machine-6")), withConditions(controlPlaneHealthyConditions)) + m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"), + machineOpt(withNodeRef("machine-7")), withConditions(controlPlaneHealthyConditions)) + m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"), + machineOpt(withNodeRef("machine-8")), withConditions(controlPlaneHealthyConditions)) + m9 := machine("machine-9", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), + machineOpt(withNodeRef("machine-9")), withConditions(controlPlaneHealthyConditions), machineOpt(withUnhealthyEtcdMember())) + m10 := machine("machine-10", withFailureDomain("two"), withTimestamp(startDate.Add(-3*time.Hour)), + machineOpt(withNodeRef("machine-10")), withConditions(controlPlaneHealthyConditions), machineOpt(withUnhealthyEtcdMember())) mc3 := collections.FromMachines(m1, m2, m3, m4, m5) mc6 := collections.FromMachines(m6, m7, m8) @@ -458,6 +479,20 @@ func TestSelectMachineForScaleDown(t *testing.T) { expectErr: false, expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-8"}}, }, + { + name: "when there are machines needing upgrade, it returns the unhealthy machine in the largest failure domain which needing upgrade", + cp: upToDateControlPlane, + outDatedMachines: collections.FromMachines(m6, m9), + expectErr: false, + expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-9"}}, + }, + { + name: "when there are machines needing upgrade, it returns the oldest unhealthy machine in the largest failure domain which needing upgrade", + cp: upToDateControlPlane, + outDatedMachines: collections.FromMachines(m6, m9, m10), + expectErr: false, + expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}}, + }, } for _, tc := range testCases { @@ -670,3 +705,9 @@ func withTimestamp(t time.Time) machineOpt { m.CreationTimestamp = metav1.NewTime(t) } } + +func withConditions(conditions clusterv1.Conditions) machineOpt { + return func(m *clusterv1.Machine) { + m.Status.Conditions = conditions + } +} diff --git a/docs/proposals/20191017-kubeadm-based-control-plane.md b/docs/proposals/20191017-kubeadm-based-control-plane.md index a057fb82be20..c23b740e5cc7 100644 --- a/docs/proposals/20191017-kubeadm-based-control-plane.md +++ b/docs/proposals/20191017-kubeadm-based-control-plane.md @@ -403,6 +403,7 @@ spec: - Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, while selecting machines for scale down: - outdatedMachines with the delete annotation - machines with the delete annotation + - outdatedMachines with unhealthy conditions - outdated machines - all machines diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 775e9a8b8ee8..2412527e461d 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -274,3 +274,35 @@ func HasNode() Func { return machine.Status.NodeRef != nil } } + +// HasUnhealthyControlPlaneMachineCondition returns a filter to find all unhealthy control plane machines, +// unlike the HasUnhealthyCondition func, it checks whether all health conditions on the control plane machine are true, +// including 'APIServerPodHealthy', 'ControllerManagerPodHealthy', 'SchedulerPodHealthy' and kubernetes node exist. +// If etcd managed, additional checks for 'EtcdPodHealthy', 'EtcdMemberHealthy' conditions. +func HasUnhealthyControlPlaneMachineCondition(isEtcdManaged bool) Func { + controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ + controlplanev1.MachineAPIServerPodHealthyCondition, + controlplanev1.MachineControllerManagerPodHealthyCondition, + controlplanev1.MachineSchedulerPodHealthyCondition, + } + if isEtcdManaged { + controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions, + controlplanev1.MachineEtcdPodHealthyCondition, + controlplanev1.MachineEtcdMemberHealthyCondition, + ) + } + + return func(machine *clusterv1.Machine) bool { + if !HasNode()(machine) { + return true + } + + for _, condition := range controlPlaneMachineHealthConditions { + if conditions.IsFalse(machine, condition) { + return true + } + } + + return false + } +} From 1f06b737c0f662e4ed11212e4368ff4bc0e03417 Mon Sep 17 00:00:00 2001 From: Levi080513 Date: Tue, 27 Feb 2024 10:31:20 +0800 Subject: [PATCH 02/14] Apply suggestions from code review Co-authored-by: Lubomir I. Ivanov --- controlplane/kubeadm/internal/control_plane.go | 2 +- util/collections/machine_filters.go | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 7f875c38b173..7fa4a1aaca47 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -315,7 +315,7 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC c.workloadCluster = nil } -// ControlPlaneMachinesWithUnhealthyCondition Returns unhealthy control plane machines. Unlike the UnhealthyMachines function, +// ControlPlaneMachinesWithUnhealthyCondition returns all unhealthy control plane machines. Unlike the UnhealthyMachines function, // it checks if all health conditions on the control plane machines are true, not just marked by MHC. func (c *ControlPlane) ControlPlaneMachinesWithUnhealthyCondition(machines collections.Machines) collections.Machines { return machines.Filter(collections.HasUnhealthyControlPlaneMachineCondition(c.IsEtcdManaged())) diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 2412527e461d..c68ed2679cd1 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -277,8 +277,8 @@ func HasNode() Func { // HasUnhealthyControlPlaneMachineCondition returns a filter to find all unhealthy control plane machines, // unlike the HasUnhealthyCondition func, it checks whether all health conditions on the control plane machine are true, -// including 'APIServerPodHealthy', 'ControllerManagerPodHealthy', 'SchedulerPodHealthy' and kubernetes node exist. -// If etcd managed, additional checks for 'EtcdPodHealthy', 'EtcdMemberHealthy' conditions. +// including 'APIServerPodHealthy', 'ControllerManagerPodHealthy', 'SchedulerPodHealthy' and Kubernetes node exist. +// If etcd is managed, add additional checks for 'EtcdPodHealthy', 'EtcdMemberHealthy' conditions. func HasUnhealthyControlPlaneMachineCondition(isEtcdManaged bool) Func { controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ controlplanev1.MachineAPIServerPodHealthyCondition, @@ -291,18 +291,15 @@ func HasUnhealthyControlPlaneMachineCondition(isEtcdManaged bool) Func { controlplanev1.MachineEtcdMemberHealthyCondition, ) } - return func(machine *clusterv1.Machine) bool { if !HasNode()(machine) { return true } - for _, condition := range controlPlaneMachineHealthConditions { if conditions.IsFalse(machine, condition) { return true } } - return false } } From 876e95cb03b8ce091173daf386ade9954b4834dc Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Tue, 27 Feb 2024 20:45:19 +0800 Subject: [PATCH 03/14] refine function comments and test code --- .../kubeadm/internal/control_plane.go | 13 ++-- .../internal/controllers/remediation_test.go | 8 +++ .../kubeadm/internal/controllers/scale.go | 4 +- .../internal/controllers/scale_test.go | 70 ++++++++----------- util/collections/machine_filters.go | 58 +++++++-------- 5 files changed, 74 insertions(+), 79 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 7fa4a1aaca47..773792a8b5da 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -238,6 +238,13 @@ func (c *ControlPlane) IsEtcdManaged() bool { return c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil } +// UnhealthyMachinesWithNonMHCUnhealthyCondition returns all unhealthy control plane machines that +// does not have a Kubernetes node or have any control plane component condition set to False. +// It is different from the UnhealthyMachines func which checks MachineHealthCheck conditions. +func (c *ControlPlane) UnhealthyMachinesWithNonMHCUnhealthyCondition(machines collections.Machines) collections.Machines { + return machines.Filter(collections.HasUnhealthyControlPlaneComponentCondition(c.IsEtcdManaged())) +} + // UnhealthyMachines returns the list of control plane machines marked as unhealthy by MHC. func (c *ControlPlane) UnhealthyMachines() collections.Machines { return c.Machines.Filter(collections.HasUnhealthyCondition) @@ -314,9 +321,3 @@ func (c *ControlPlane) InjectTestManagementCluster(managementCluster ManagementC c.managementCluster = managementCluster c.workloadCluster = nil } - -// ControlPlaneMachinesWithUnhealthyCondition returns all unhealthy control plane machines. Unlike the UnhealthyMachines function, -// it checks if all health conditions on the control plane machines are true, not just marked by MHC. -func (c *ControlPlane) ControlPlaneMachinesWithUnhealthyCondition(machines collections.Machines) collections.Machines { - return machines.Filter(collections.HasUnhealthyControlPlaneMachineCondition(c.IsEtcdManaged())) -} diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index 6c72982b710d..416ca604231d 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -1883,6 +1883,14 @@ func withUnhealthyEtcdMember() machineOption { } } +func withUnhealthyAPIServerPod() machineOption { + return func(machine *clusterv1.Machine) { + newConditions := machine.Status.Conditions.DeepCopy() + machine.Status.Conditions = newConditions + conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "") + } +} + func withNodeRef(ref string) machineOption { return func(machine *clusterv1.Machine) { machine.Status.NodeRef = &corev1.ObjectReference{ diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 6a992903d36d..8d4f43e64f9c 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -230,8 +230,8 @@ func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.Contr machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines) case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0: machines = controlPlane.MachineWithDeleteAnnotation(machines) - case controlPlane.ControlPlaneMachinesWithUnhealthyCondition(outdatedMachines).Len() > 0: - machines = controlPlane.ControlPlaneMachinesWithUnhealthyCondition(outdatedMachines) + case controlPlane.UnhealthyMachinesWithNonMHCUnhealthyCondition(outdatedMachines).Len() > 0: + machines = controlPlane.UnhealthyMachinesWithNonMHCUnhealthyCondition(outdatedMachines) case outdatedMachines.Len() > 0: machines = outdatedMachines } diff --git a/controlplane/kubeadm/internal/controllers/scale_test.go b/controlplane/kubeadm/internal/controllers/scale_test.go index 4268a39c39fb..f1ea06ea6ccb 100644 --- a/controlplane/kubeadm/internal/controllers/scale_test.go +++ b/controlplane/kubeadm/internal/controllers/scale_test.go @@ -369,37 +369,24 @@ func TestSelectMachineForScaleDown(t *testing.T) { } startDate := time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC) - controlPlaneHealthyConditions := []clusterv1.Condition{ - *conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition), - *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition), - } - - m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)), - machineOpt(withNodeRef("machine-1")), withConditions(controlPlaneHealthyConditions)) - m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)), - machineOpt(withNodeRef("machine-2")), withConditions(controlPlaneHealthyConditions)) - m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)), - machineOpt(withNodeRef("machine-3")), withConditions(controlPlaneHealthyConditions)) - m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)), - machineOpt(withNodeRef("machine-4")), withConditions(controlPlaneHealthyConditions)) - m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), - machineOpt(withNodeRef("machine-5")), withConditions(controlPlaneHealthyConditions)) - m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)), - machineOpt(withNodeRef("machine-6")), withConditions(controlPlaneHealthyConditions)) - m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"), - machineOpt(withNodeRef("machine-7")), withConditions(controlPlaneHealthyConditions)) - m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"), - machineOpt(withNodeRef("machine-8")), withConditions(controlPlaneHealthyConditions)) - m9 := machine("machine-9", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), - machineOpt(withNodeRef("machine-9")), withConditions(controlPlaneHealthyConditions), machineOpt(withUnhealthyEtcdMember())) - m10 := machine("machine-10", withFailureDomain("two"), withTimestamp(startDate.Add(-3*time.Hour)), - machineOpt(withNodeRef("machine-10")), withConditions(controlPlaneHealthyConditions), machineOpt(withUnhealthyEtcdMember())) + m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour))) + m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour))) + m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour))) + m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour))) + m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour))) + m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour))) + m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine")) + m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine")) + m9 := machine("machine-9", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), + machineOpt(withNodeRef("machine-9"))) + m10 := machine("machine-10", withFailureDomain("two"), withTimestamp(startDate.Add(-4*time.Hour)), + machineOpt(withNodeRef("machine-10")), machineOpt(withUnhealthyAPIServerPod())) + m11 := machine("machine-11", withFailureDomain("two"), withTimestamp(startDate.Add(-3*time.Hour)), + machineOpt(withNodeRef("machine-11")), machineOpt(withUnhealthyEtcdMember())) mc3 := collections.FromMachines(m1, m2, m3, m4, m5) mc6 := collections.FromMachines(m6, m7, m8) + mc9 := collections.FromMachines(m9, m10, m11) fd := clusterv1.FailureDomains{ "one": failureDomain(true), "two": failureDomain(true), @@ -410,6 +397,11 @@ func TestSelectMachineForScaleDown(t *testing.T) { Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}}, Machines: mc3, } + needsUpgradeControlPlane1 := &internal.ControlPlane{ + KCP: &kcp, + Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}}, + Machines: mc9, + } upToDateControlPlane := &internal.ControlPlane{ KCP: &kcp, Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}}, @@ -473,23 +465,23 @@ func TestSelectMachineForScaleDown(t *testing.T) { expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-7"}}, }, { - name: "when there is an up to date machine with delete annotation, while there are any outdated machines without annotatio that still exist, it returns oldest marked machine first", + name: "when there is an up to date machine with delete annotation, while there are any outdated machines without annotation that still exist, it returns oldest marked machine first", cp: upToDateControlPlane, outDatedMachines: collections.FromMachines(m5, m3, m8, m7, m6, m1, m2), expectErr: false, expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-8"}}, }, { - name: "when there are machines needing upgrade, it returns the unhealthy machine in the largest failure domain which needing upgrade", - cp: upToDateControlPlane, - outDatedMachines: collections.FromMachines(m6, m9), + name: "when there are machines needing upgrade, it returns the single unhealthy machine with MachineAPIServerPodHealthyCondition set to False", + cp: needsUpgradeControlPlane1, + outDatedMachines: collections.FromMachines(m9, m10), expectErr: false, - expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-9"}}, + expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}}, }, { - name: "when there are machines needing upgrade, it returns the oldest unhealthy machine in the largest failure domain which needing upgrade", - cp: upToDateControlPlane, - outDatedMachines: collections.FromMachines(m6, m9, m10), + name: "when there are machines needing upgrade, it returns the oldest unhealthy machine with MachineEtcdMemberHealthyCondition set to False", + cp: needsUpgradeControlPlane1, + outDatedMachines: collections.FromMachines(m9, m10, m11), expectErr: false, expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}}, }, @@ -705,9 +697,3 @@ func withTimestamp(t time.Time) machineOpt { m.CreationTimestamp = metav1.NewTime(t) } } - -func withConditions(conditions clusterv1.Conditions) machineOpt { - return func(m *clusterv1.Machine) { - m.Status.Conditions = conditions - } -} diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index c68ed2679cd1..13215a5db166 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -158,6 +158,35 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool { return conditions.IsFalse(machine, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) } +// HasUnhealthyControlPlaneComponentCondition returns a filter to find all unhealthy control plane machines that +// does not have a Kubernetes node or have any of the follwing control plane component conditions set to False: +// APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy(if using managed etcd). +// It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions. +func HasUnhealthyControlPlaneComponentCondition(isEtcdManaged bool) Func { + controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ + controlplanev1.MachineAPIServerPodHealthyCondition, + controlplanev1.MachineControllerManagerPodHealthyCondition, + controlplanev1.MachineSchedulerPodHealthyCondition, + } + if isEtcdManaged { + controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions, + controlplanev1.MachineEtcdPodHealthyCondition, + controlplanev1.MachineEtcdMemberHealthyCondition, + ) + } + return func(machine *clusterv1.Machine) bool { + if !HasNode()(machine) { + return true + } + for _, condition := range controlPlaneMachineHealthConditions { + if conditions.IsFalse(machine, condition) { + return true + } + } + return false + } +} + // IsReady returns a filter to find all machines with the ReadyCondition equals to True. func IsReady() Func { return func(machine *clusterv1.Machine) bool { @@ -274,32 +303,3 @@ func HasNode() Func { return machine.Status.NodeRef != nil } } - -// HasUnhealthyControlPlaneMachineCondition returns a filter to find all unhealthy control plane machines, -// unlike the HasUnhealthyCondition func, it checks whether all health conditions on the control plane machine are true, -// including 'APIServerPodHealthy', 'ControllerManagerPodHealthy', 'SchedulerPodHealthy' and Kubernetes node exist. -// If etcd is managed, add additional checks for 'EtcdPodHealthy', 'EtcdMemberHealthy' conditions. -func HasUnhealthyControlPlaneMachineCondition(isEtcdManaged bool) Func { - controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ - controlplanev1.MachineAPIServerPodHealthyCondition, - controlplanev1.MachineControllerManagerPodHealthyCondition, - controlplanev1.MachineSchedulerPodHealthyCondition, - } - if isEtcdManaged { - controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions, - controlplanev1.MachineEtcdPodHealthyCondition, - controlplanev1.MachineEtcdMemberHealthyCondition, - ) - } - return func(machine *clusterv1.Machine) bool { - if !HasNode()(machine) { - return true - } - for _, condition := range controlPlaneMachineHealthConditions { - if conditions.IsFalse(machine, condition) { - return true - } - } - return false - } -} From 13b6eedc61fc88c6c1590da4081e1ad0cad37a35 Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Wed, 28 Feb 2024 09:40:29 +0800 Subject: [PATCH 04/14] address review comments and refine doc --- controlplane/kubeadm/internal/controllers/remediation_test.go | 2 -- docs/proposals/20191017-kubeadm-based-control-plane.md | 4 ++-- util/collections/machine_filters.go | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index 416ca604231d..9ed9f177b674 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -1885,8 +1885,6 @@ func withUnhealthyEtcdMember() machineOption { func withUnhealthyAPIServerPod() machineOption { return func(machine *clusterv1.Machine) { - newConditions := machine.Status.Conditions.DeepCopy() - machine.Status.Conditions = newConditions conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "") } } diff --git a/docs/proposals/20191017-kubeadm-based-control-plane.md b/docs/proposals/20191017-kubeadm-based-control-plane.md index c23b740e5cc7..26a1aaddb43c 100644 --- a/docs/proposals/20191017-kubeadm-based-control-plane.md +++ b/docs/proposals/20191017-kubeadm-based-control-plane.md @@ -401,9 +401,9 @@ spec: - See [Preflight checks](#preflight-checks) below. - Scale down operations removes the oldest machine in the failure domain that has the most control-plane machines on it. - Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, while selecting machines for scale down: - - outdatedMachines with the delete annotation + - outdated machines with the delete annotation - machines with the delete annotation - - outdatedMachines with unhealthy conditions + - outdated machines with unhealthy control plane component pods - outdated machines - all machines diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 13215a5db166..45a5825712b8 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -159,7 +159,7 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool { } // HasUnhealthyControlPlaneComponentCondition returns a filter to find all unhealthy control plane machines that -// does not have a Kubernetes node or have any of the follwing control plane component conditions set to False: +// does not have a Kubernetes node or have any of the following control plane component conditions set to False: // APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy(if using managed etcd). // It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions. func HasUnhealthyControlPlaneComponentCondition(isEtcdManaged bool) Func { From 2164685081bc615958f7f34b6b1c752d0e375a19 Mon Sep 17 00:00:00 2001 From: huangwei Date: Wed, 28 Feb 2024 10:24:35 +0800 Subject: [PATCH 05/14] address review comments --- controlplane/kubeadm/internal/controllers/remediation_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/remediation_test.go b/controlplane/kubeadm/internal/controllers/remediation_test.go index 9ed9f177b674..463b62f3b141 100644 --- a/controlplane/kubeadm/internal/controllers/remediation_test.go +++ b/controlplane/kubeadm/internal/controllers/remediation_test.go @@ -1877,8 +1877,6 @@ func withHealthyEtcdMember() machineOption { func withUnhealthyEtcdMember() machineOption { return func(machine *clusterv1.Machine) { - newConditions := machine.Status.Conditions.DeepCopy() - machine.Status.Conditions = newConditions conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "") } } From 63c028136d4c7fd769a681de06aab2ca9a598764 Mon Sep 17 00:00:00 2001 From: huangwei Date: Thu, 29 Feb 2024 11:51:08 +0800 Subject: [PATCH 06/14] fix imperfect check conditions and update ut --- .../internal/controllers/scale_test.go | 19 ++-- util/collections/machine_filters_test.go | 96 ++++++++++++++++++- 2 files changed, 105 insertions(+), 10 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/scale_test.go b/controlplane/kubeadm/internal/controllers/scale_test.go index f1ea06ea6ccb..d35db435bdfe 100644 --- a/controlplane/kubeadm/internal/controllers/scale_test.go +++ b/controlplane/kubeadm/internal/controllers/scale_test.go @@ -368,15 +368,16 @@ func TestSelectMachineForScaleDown(t *testing.T) { Spec: controlplanev1.KubeadmControlPlaneSpec{}, } startDate := time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC) - - m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour))) - m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour))) - m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour))) - m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour))) - m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour))) - m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour))) - m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine")) - m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine")) + m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)), machineOpt(withNodeRef("machine-1"))) + m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)), machineOpt(withNodeRef("machine-2"))) + m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)), machineOpt(withNodeRef("machine-3"))) + m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)), machineOpt(withNodeRef("machine-4"))) + m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), machineOpt(withNodeRef("machine-5"))) + m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)), machineOpt(withNodeRef("machine-6"))) + m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), + withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-7"))) + m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), + withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-8"))) m9 := machine("machine-9", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), machineOpt(withNodeRef("machine-9"))) m10 := machine("machine-10", withFailureDomain("two"), withTimestamp(startDate.Add(-4*time.Hour)), diff --git a/util/collections/machine_filters_test.go b/util/collections/machine_filters_test.go index bd7929c866e1..feee29c2de5b 100644 --- a/util/collections/machine_filters_test.go +++ b/util/collections/machine_filters_test.go @@ -378,7 +378,7 @@ func TestWithVersion(t *testing.T) { }) } -func TestHealtyAPIServer(t *testing.T) { +func TestHealthyAPIServer(t *testing.T) { t.Run("nil machine returns false", func(t *testing.T) { g := NewWithT(t) g.Expect(collections.HealthyAPIServer()(nil)).To(BeFalse()) @@ -454,6 +454,100 @@ func TestHasNode(t *testing.T) { }) } +func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { + t.Run("nil machine returns true", func(t *testing.T) { + g := NewWithT(t) + g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(nil)).To(BeTrue()) + }) + + t.Run("machine without node returns true", func(t *testing.T) { + g := NewWithT(t) + machine := &clusterv1.Machine{} + g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeTrue()) + }) + + t.Run("machine with all healthy controlPlane component conditions returns false when the Etcd is not managed", func(t *testing.T) { + g := NewWithT(t) + machine := &clusterv1.Machine{} + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "node1", + } + machine.Status.Conditions = clusterv1.Conditions{ + *conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), + } + g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeFalse()) + }) + + t.Run("machine with unhealthy 'APIServerPodHealthy' condition returns true when the Etcd is not managed", func(t *testing.T) { + g := NewWithT(t) + machine := &clusterv1.Machine{} + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "node1", + } + machine.Status.Conditions = clusterv1.Conditions{ + *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), + *conditions.FalseCondition(controlplanev1.MachineAPIServerPodHealthyCondition, "", + clusterv1.ConditionSeverityWarning, ""), + } + g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeTrue()) + }) + + t.Run("machine with unhealthy etcd component conditions returns false when Etcd is not managed", func(t *testing.T) { + g := NewWithT(t) + machine := &clusterv1.Machine{} + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "node1", + } + machine.Status.Conditions = clusterv1.Conditions{ + *conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), + *conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "", + clusterv1.ConditionSeverityWarning, ""), + *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "", + clusterv1.ConditionSeverityWarning, ""), + } + g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeFalse()) + }) + + t.Run("machine with unhealthy etcd conditions returns true when Etcd is managed", func(t *testing.T) { + g := NewWithT(t) + machine := &clusterv1.Machine{} + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "node1", + } + machine.Status.Conditions = clusterv1.Conditions{ + *conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), + *conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "", + clusterv1.ConditionSeverityWarning, ""), + *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "", + clusterv1.ConditionSeverityWarning, ""), + } + g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(true)(machine)).To(BeTrue()) + }) + + t.Run("machine with all healthy controlPlane and the Etcd component conditions returns false when Etcd is managed", func(t *testing.T) { + g := NewWithT(t) + machine := &clusterv1.Machine{} + machine.Status.NodeRef = &corev1.ObjectReference{ + Name: "node1", + } + machine.Status.Conditions = clusterv1.Conditions{ + *conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition), + *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition), + } + g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(true)(machine)).To(BeFalse()) + }) +} + func testControlPlaneMachine(name string) *clusterv1.Machine { owned := true ownedRef := []metav1.OwnerReference{ From 6730c1060d292a356f7477593718b9da609747a3 Mon Sep 17 00:00:00 2001 From: huangwei Date: Fri, 1 Mar 2024 10:20:13 +0800 Subject: [PATCH 07/14] add note --- util/collections/machine_filters.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 45a5825712b8..72639cec7d78 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -179,6 +179,9 @@ func HasUnhealthyControlPlaneComponentCondition(isEtcdManaged bool) Func { return true } for _, condition := range controlPlaneMachineHealthConditions { + // Do not return true when the condition is not set or is set to Unknown because + // it means a transient state and can not be considered as unhealthy. + // preflightCheckCondition() can cover these two cases and skip the scaling up/down. if conditions.IsFalse(machine, condition) { return true } From 5b1e9c8a03ba0a2146b6bea6f20624e4f1a16346 Mon Sep 17 00:00:00 2001 From: huangwei Date: Thu, 7 Mar 2024 15:02:00 +0800 Subject: [PATCH 08/14] redefine func name --- .../kubeadm/internal/control_plane.go | 27 ++++++++++--------- .../kubeadm/internal/control_plane_test.go | 4 +-- .../internal/controllers/remediation.go | 6 ++--- .../kubeadm/internal/controllers/scale.go | 4 +-- util/collections/machine_filters.go | 4 +-- util/collections/machine_filters_test.go | 14 +++++----- 6 files changed, 30 insertions(+), 29 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 773792a8b5da..f1df66386bd7 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -238,31 +238,32 @@ func (c *ControlPlane) IsEtcdManaged() bool { return c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil } -// UnhealthyMachinesWithNonMHCUnhealthyCondition returns all unhealthy control plane machines that -// does not have a Kubernetes node or have any control plane component condition set to False. -// It is different from the UnhealthyMachines func which checks MachineHealthCheck conditions. -func (c *ControlPlane) UnhealthyMachinesWithNonMHCUnhealthyCondition(machines collections.Machines) collections.Machines { - return machines.Filter(collections.HasUnhealthyControlPlaneComponentCondition(c.IsEtcdManaged())) +// UnhealthyControlPlaneOrNodeMissingMachines returns all unhealthy control plane machines that +// do not have a Kubernetes node, or have unhealthy control plane components. +// +// It differs from UnhealthyMachinesByHealthCheck which checks `MachineHealthCheck` conditions. +func (c *ControlPlane) UnhealthyControlPlaneOrNodeMissingMachines(machines collections.Machines) collections.Machines { + return machines.Filter(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(c.IsEtcdManaged())) } -// UnhealthyMachines returns the list of control plane machines marked as unhealthy by MHC. -func (c *ControlPlane) UnhealthyMachines() collections.Machines { +// UnhealthyMachinesByHealthCheck returns the list of control plane machines marked as unhealthy by Machine Health Check. +func (c *ControlPlane) UnhealthyMachinesByHealthCheck() collections.Machines { return c.Machines.Filter(collections.HasUnhealthyCondition) } -// HealthyMachines returns the list of control plane machines not marked as unhealthy by MHC. -func (c *ControlPlane) HealthyMachines() collections.Machines { +// HealthyMachinesByHealthCheck returns the list of control plane machines not marked as unhealthy by Machine Health Check. +func (c *ControlPlane) HealthyMachinesByHealthCheck() collections.Machines { return c.Machines.Filter(collections.Not(collections.HasUnhealthyCondition)) } -// HasUnhealthyMachine returns true if any machine in the control plane is marked as unhealthy by MHC. -func (c *ControlPlane) HasUnhealthyMachine() bool { - return len(c.UnhealthyMachines()) > 0 +// HasUnhealthyMachineByHealthCheck returns true if any machine in the control plane is marked as unhealthy by Machine Health Check. +func (c *ControlPlane) HasUnhealthyMachineByHealthCheck() bool { + return len(c.UnhealthyMachinesByHealthCheck()) > 0 } // HasHealthyMachineStillProvisioning returns true if any healthy machine in the control plane is still in the process of being provisioned. func (c *ControlPlane) HasHealthyMachineStillProvisioning() bool { - return len(c.HealthyMachines().Filter(collections.Not(collections.HasNode()))) > 0 + return len(c.HealthyMachinesByHealthCheck().Filter(collections.Not(collections.HasNode()))) > 0 } // PatchMachines patches all the machines conditions. diff --git a/controlplane/kubeadm/internal/control_plane_test.go b/controlplane/kubeadm/internal/control_plane_test.go index f87a5ab9dbbf..c9b23d0b6f66 100644 --- a/controlplane/kubeadm/internal/control_plane_test.go +++ b/controlplane/kubeadm/internal/control_plane_test.go @@ -88,7 +88,7 @@ func TestHasUnhealthyMachine(t *testing.T) { } g := NewWithT(t) - g.Expect(c.HasUnhealthyMachine()).To(BeTrue()) + g.Expect(c.HasUnhealthyMachineByHealthCheck()).To(BeTrue()) }) t.Run("No unhealthy machine to be remediated by KCP", func(t *testing.T) { @@ -101,7 +101,7 @@ func TestHasUnhealthyMachine(t *testing.T) { } g := NewWithT(t) - g.Expect(c.HasUnhealthyMachine()).To(BeFalse()) + g.Expect(c.HasUnhealthyMachineByHealthCheck()).To(BeFalse()) }) } diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 94d153dbca25..009384885ed8 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -48,7 +48,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1) // if the underlying machine is now back to healthy / not deleting. errList := []error{} - healthyMachines := controlPlane.HealthyMachines() + healthyMachines := controlPlane.HealthyMachinesByHealthCheck() for _, m := range healthyMachines { if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) && @@ -74,7 +74,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine) // and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation. - unhealthyMachines := controlPlane.UnhealthyMachines() + unhealthyMachines := controlPlane.UnhealthyMachinesByHealthCheck() // If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil). if len(unhealthyMachines) == 0 { @@ -192,7 +192,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. if controlPlane.IsEtcdManaged() { - etcdLeaderCandidate := controlPlane.HealthyMachines().Newest() + etcdLeaderCandidate := controlPlane.HealthyMachinesByHealthCheck().Newest() if etcdLeaderCandidate == nil { log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 8d4f43e64f9c..5938cae44363 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -230,8 +230,8 @@ func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.Contr machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines) case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0: machines = controlPlane.MachineWithDeleteAnnotation(machines) - case controlPlane.UnhealthyMachinesWithNonMHCUnhealthyCondition(outdatedMachines).Len() > 0: - machines = controlPlane.UnhealthyMachinesWithNonMHCUnhealthyCondition(outdatedMachines) + case controlPlane.UnhealthyControlPlaneOrNodeMissingMachines(outdatedMachines).Len() > 0: + machines = controlPlane.UnhealthyControlPlaneOrNodeMissingMachines(outdatedMachines) case outdatedMachines.Len() > 0: machines = outdatedMachines } diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 72639cec7d78..cef94ffd4b5a 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -158,11 +158,11 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool { return conditions.IsFalse(machine, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) } -// HasUnhealthyControlPlaneComponentCondition returns a filter to find all unhealthy control plane machines that +// HasMissingNodeOrUnhealthyControlPlaneComponents returns a filter to find all unhealthy control plane machines that // does not have a Kubernetes node or have any of the following control plane component conditions set to False: // APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy(if using managed etcd). // It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions. -func HasUnhealthyControlPlaneComponentCondition(isEtcdManaged bool) Func { +func HasMissingNodeOrUnhealthyControlPlaneComponents(isEtcdManaged bool) Func { controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.MachineControllerManagerPodHealthyCondition, diff --git a/util/collections/machine_filters_test.go b/util/collections/machine_filters_test.go index feee29c2de5b..0d586a222db3 100644 --- a/util/collections/machine_filters_test.go +++ b/util/collections/machine_filters_test.go @@ -457,13 +457,13 @@ func TestHasNode(t *testing.T) { func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { t.Run("nil machine returns true", func(t *testing.T) { g := NewWithT(t) - g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(nil)).To(BeTrue()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(nil)).To(BeTrue()) }) t.Run("machine without node returns true", func(t *testing.T) { g := NewWithT(t) machine := &clusterv1.Machine{} - g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeTrue()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue()) }) t.Run("machine with all healthy controlPlane component conditions returns false when the Etcd is not managed", func(t *testing.T) { @@ -477,7 +477,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), } - g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeFalse()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse()) }) t.Run("machine with unhealthy 'APIServerPodHealthy' condition returns true when the Etcd is not managed", func(t *testing.T) { @@ -492,7 +492,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.FalseCondition(controlplanev1.MachineAPIServerPodHealthyCondition, "", clusterv1.ConditionSeverityWarning, ""), } - g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeTrue()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue()) }) t.Run("machine with unhealthy etcd component conditions returns false when Etcd is not managed", func(t *testing.T) { @@ -510,7 +510,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "", clusterv1.ConditionSeverityWarning, ""), } - g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(false)(machine)).To(BeFalse()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse()) }) t.Run("machine with unhealthy etcd conditions returns true when Etcd is managed", func(t *testing.T) { @@ -528,7 +528,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "", clusterv1.ConditionSeverityWarning, ""), } - g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(true)(machine)).To(BeTrue()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(true)(machine)).To(BeTrue()) }) t.Run("machine with all healthy controlPlane and the Etcd component conditions returns false when Etcd is managed", func(t *testing.T) { @@ -544,7 +544,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition), *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition), } - g.Expect(collections.HasUnhealthyControlPlaneComponentCondition(true)(machine)).To(BeFalse()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(true)(machine)).To(BeFalse()) }) } From 1ad1a86c5692c8524dce42078356eb7644ea2090 Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Fri, 5 Apr 2024 16:20:20 +0800 Subject: [PATCH 09/14] Update docs/proposals/20191017-kubeadm-based-control-plane.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Stefan Büringer <4662360+sbueringer@users.noreply.github.com> --- docs/proposals/20191017-kubeadm-based-control-plane.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/proposals/20191017-kubeadm-based-control-plane.md b/docs/proposals/20191017-kubeadm-based-control-plane.md index 26a1aaddb43c..599be302e096 100644 --- a/docs/proposals/20191017-kubeadm-based-control-plane.md +++ b/docs/proposals/20191017-kubeadm-based-control-plane.md @@ -403,7 +403,7 @@ spec: - Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, while selecting machines for scale down: - outdated machines with the delete annotation - machines with the delete annotation - - outdated machines with unhealthy control plane component pods + - outdated machines with unhealthy control plane component pods or missing nodes - outdated machines - all machines From 0227b82c60cfc028feb97af49efa5f1d3c021524 Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Fri, 5 Apr 2024 16:21:46 +0800 Subject: [PATCH 10/14] Update util/collections/machine_filters.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Stefan Büringer <4662360+sbueringer@users.noreply.github.com> --- util/collections/machine_filters.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index cef94ffd4b5a..78ad95c616f1 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -160,7 +160,7 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool { // HasMissingNodeOrUnhealthyControlPlaneComponents returns a filter to find all unhealthy control plane machines that // does not have a Kubernetes node or have any of the following control plane component conditions set to False: -// APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy(if using managed etcd). +// APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy & EtcdMemberHealthy (if using managed etcd). // It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions. func HasMissingNodeOrUnhealthyControlPlaneComponents(isEtcdManaged bool) Func { controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ From 26d08b1b7317e9731e1925e14456f44328221ad9 Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Fri, 5 Apr 2024 16:28:30 +0800 Subject: [PATCH 11/14] rename func to UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents --- controlplane/kubeadm/internal/control_plane.go | 4 ++-- controlplane/kubeadm/internal/controllers/scale.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index f1df66386bd7..8ae251810ee5 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -238,11 +238,11 @@ func (c *ControlPlane) IsEtcdManaged() bool { return c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil } -// UnhealthyControlPlaneOrNodeMissingMachines returns all unhealthy control plane machines that +// UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents returns all unhealthy control plane machines that // do not have a Kubernetes node, or have unhealthy control plane components. // // It differs from UnhealthyMachinesByHealthCheck which checks `MachineHealthCheck` conditions. -func (c *ControlPlane) UnhealthyControlPlaneOrNodeMissingMachines(machines collections.Machines) collections.Machines { +func (c *ControlPlane) UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(machines collections.Machines) collections.Machines { return machines.Filter(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(c.IsEtcdManaged())) } diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 5938cae44363..8a5243b82ef6 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -230,8 +230,8 @@ func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.Contr machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines) case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0: machines = controlPlane.MachineWithDeleteAnnotation(machines) - case controlPlane.UnhealthyControlPlaneOrNodeMissingMachines(outdatedMachines).Len() > 0: - machines = controlPlane.UnhealthyControlPlaneOrNodeMissingMachines(outdatedMachines) + case controlPlane.UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0: + machines = controlPlane.UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(outdatedMachines) case outdatedMachines.Len() > 0: machines = outdatedMachines } From fb628c0cbc3cda00801fb1d9a77d6b4c6d6f7c5f Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Fri, 5 Apr 2024 16:41:09 +0800 Subject: [PATCH 12/14] nil machine returns false --- util/collections/machine_filters.go | 3 +++ util/collections/machine_filters_test.go | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 78ad95c616f1..1c47ef6fdba1 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -175,6 +175,9 @@ func HasMissingNodeOrUnhealthyControlPlaneComponents(isEtcdManaged bool) Func { ) } return func(machine *clusterv1.Machine) bool { + if machine == nil { + return false + } if !HasNode()(machine) { return true } diff --git a/util/collections/machine_filters_test.go b/util/collections/machine_filters_test.go index 0d586a222db3..491db725795f 100644 --- a/util/collections/machine_filters_test.go +++ b/util/collections/machine_filters_test.go @@ -455,9 +455,9 @@ func TestHasNode(t *testing.T) { } func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { - t.Run("nil machine returns true", func(t *testing.T) { + t.Run("nil machine returns false", func(t *testing.T) { g := NewWithT(t) - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(nil)).To(BeTrue()) + g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(nil)).To(BeFalse()) }) t.Run("machine without node returns true", func(t *testing.T) { From b3a54b95a31468cf6367831286402991cb9d417d Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Fri, 5 Apr 2024 16:54:37 +0800 Subject: [PATCH 13/14] rename func HealthyMachinesByHealthCheck to HealthyMachinesByMachineHealthCheck --- controlplane/kubeadm/internal/control_plane.go | 16 ++++++++-------- .../kubeadm/internal/control_plane_test.go | 4 ++-- .../kubeadm/internal/controllers/remediation.go | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 8ae251810ee5..b9d52ec82914 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -246,24 +246,24 @@ func (c *ControlPlane) UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneCo return machines.Filter(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(c.IsEtcdManaged())) } -// UnhealthyMachinesByHealthCheck returns the list of control plane machines marked as unhealthy by Machine Health Check. -func (c *ControlPlane) UnhealthyMachinesByHealthCheck() collections.Machines { +// UnhealthyMachinesByMachineHealthCheck returns the list of control plane machines marked as unhealthy by Machine Health Check. +func (c *ControlPlane) UnhealthyMachinesByMachineHealthCheck() collections.Machines { return c.Machines.Filter(collections.HasUnhealthyCondition) } -// HealthyMachinesByHealthCheck returns the list of control plane machines not marked as unhealthy by Machine Health Check. -func (c *ControlPlane) HealthyMachinesByHealthCheck() collections.Machines { +// HealthyMachinesByMachineHealthCheck returns the list of control plane machines not marked as unhealthy by Machine Health Check. +func (c *ControlPlane) HealthyMachinesByMachineHealthCheck() collections.Machines { return c.Machines.Filter(collections.Not(collections.HasUnhealthyCondition)) } -// HasUnhealthyMachineByHealthCheck returns true if any machine in the control plane is marked as unhealthy by Machine Health Check. -func (c *ControlPlane) HasUnhealthyMachineByHealthCheck() bool { - return len(c.UnhealthyMachinesByHealthCheck()) > 0 +// HasUnhealthyMachineByMachineHealthCheck returns true if any machine in the control plane is marked as unhealthy by Machine Health Check. +func (c *ControlPlane) HasUnhealthyMachineByMachineHealthCheck() bool { + return len(c.UnhealthyMachinesByMachineHealthCheck()) > 0 } // HasHealthyMachineStillProvisioning returns true if any healthy machine in the control plane is still in the process of being provisioned. func (c *ControlPlane) HasHealthyMachineStillProvisioning() bool { - return len(c.HealthyMachinesByHealthCheck().Filter(collections.Not(collections.HasNode()))) > 0 + return len(c.HealthyMachinesByMachineHealthCheck().Filter(collections.Not(collections.HasNode()))) > 0 } // PatchMachines patches all the machines conditions. diff --git a/controlplane/kubeadm/internal/control_plane_test.go b/controlplane/kubeadm/internal/control_plane_test.go index c9b23d0b6f66..6d6d50c82a4e 100644 --- a/controlplane/kubeadm/internal/control_plane_test.go +++ b/controlplane/kubeadm/internal/control_plane_test.go @@ -88,7 +88,7 @@ func TestHasUnhealthyMachine(t *testing.T) { } g := NewWithT(t) - g.Expect(c.HasUnhealthyMachineByHealthCheck()).To(BeTrue()) + g.Expect(c.HasUnhealthyMachineByMachineHealthCheck()).To(BeTrue()) }) t.Run("No unhealthy machine to be remediated by KCP", func(t *testing.T) { @@ -101,7 +101,7 @@ func TestHasUnhealthyMachine(t *testing.T) { } g := NewWithT(t) - g.Expect(c.HasUnhealthyMachineByHealthCheck()).To(BeFalse()) + g.Expect(c.HasUnhealthyMachineByMachineHealthCheck()).To(BeFalse()) }) } diff --git a/controlplane/kubeadm/internal/controllers/remediation.go b/controlplane/kubeadm/internal/controllers/remediation.go index 009384885ed8..28f54b5538a3 100644 --- a/controlplane/kubeadm/internal/controllers/remediation.go +++ b/controlplane/kubeadm/internal/controllers/remediation.go @@ -48,7 +48,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1) // if the underlying machine is now back to healthy / not deleting. errList := []error{} - healthyMachines := controlPlane.HealthyMachinesByHealthCheck() + healthyMachines := controlPlane.HealthyMachinesByMachineHealthCheck() for _, m := range healthyMachines { if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) && @@ -74,7 +74,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine) // and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation. - unhealthyMachines := controlPlane.UnhealthyMachinesByHealthCheck() + unhealthyMachines := controlPlane.UnhealthyMachinesByMachineHealthCheck() // If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil). if len(unhealthyMachines) == 0 { @@ -192,7 +192,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C // If the machine that is about to be deleted is the etcd leader, move it to the newest member available. if controlPlane.IsEtcdManaged() { - etcdLeaderCandidate := controlPlane.HealthyMachinesByHealthCheck().Newest() + etcdLeaderCandidate := controlPlane.HealthyMachinesByMachineHealthCheck().Newest() if etcdLeaderCandidate == nil { log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to") conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning, From b892ae1982672a951789cd2d4a030b00df5d7a5e Mon Sep 17 00:00:00 2001 From: Jesse Hu Date: Thu, 11 Apr 2024 16:04:34 +0800 Subject: [PATCH 14/14] Remove !HasNode()(machine) condition from unhealthy machine filter --- controlplane/kubeadm/internal/control_plane.go | 9 ++++----- .../kubeadm/internal/controllers/scale.go | 4 ++-- .../20191017-kubeadm-based-control-plane.md | 2 +- util/collections/machine_filters.go | 13 +++++++------ util/collections/machine_filters_test.go | 16 ++++++++-------- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index b9d52ec82914..052802caacd4 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -238,12 +238,11 @@ func (c *ControlPlane) IsEtcdManaged() bool { return c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil } -// UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents returns all unhealthy control plane machines that -// do not have a Kubernetes node, or have unhealthy control plane components. -// +// UnhealthyMachinesWithUnhealthyControlPlaneComponents returns all unhealthy control plane machines that +// have unhealthy control plane components. // It differs from UnhealthyMachinesByHealthCheck which checks `MachineHealthCheck` conditions. -func (c *ControlPlane) UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(machines collections.Machines) collections.Machines { - return machines.Filter(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(c.IsEtcdManaged())) +func (c *ControlPlane) UnhealthyMachinesWithUnhealthyControlPlaneComponents(machines collections.Machines) collections.Machines { + return machines.Filter(collections.HasUnhealthyControlPlaneComponents(c.IsEtcdManaged())) } // UnhealthyMachinesByMachineHealthCheck returns the list of control plane machines marked as unhealthy by Machine Health Check. diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 8a5243b82ef6..8dc44889fe41 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -230,8 +230,8 @@ func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.Contr machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines) case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0: machines = controlPlane.MachineWithDeleteAnnotation(machines) - case controlPlane.UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0: - machines = controlPlane.UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(outdatedMachines) + case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0: + machines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines) case outdatedMachines.Len() > 0: machines = outdatedMachines } diff --git a/docs/proposals/20191017-kubeadm-based-control-plane.md b/docs/proposals/20191017-kubeadm-based-control-plane.md index 599be302e096..26a1aaddb43c 100644 --- a/docs/proposals/20191017-kubeadm-based-control-plane.md +++ b/docs/proposals/20191017-kubeadm-based-control-plane.md @@ -403,7 +403,7 @@ spec: - Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, while selecting machines for scale down: - outdated machines with the delete annotation - machines with the delete annotation - - outdated machines with unhealthy control plane component pods or missing nodes + - outdated machines with unhealthy control plane component pods - outdated machines - all machines diff --git a/util/collections/machine_filters.go b/util/collections/machine_filters.go index 1c47ef6fdba1..6c0c7813ad9a 100644 --- a/util/collections/machine_filters.go +++ b/util/collections/machine_filters.go @@ -158,11 +158,11 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool { return conditions.IsFalse(machine, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) } -// HasMissingNodeOrUnhealthyControlPlaneComponents returns a filter to find all unhealthy control plane machines that -// does not have a Kubernetes node or have any of the following control plane component conditions set to False: +// HasUnhealthyControlPlaneComponents returns a filter to find all unhealthy control plane machines that +// have any of the following control plane component conditions set to False: // APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy & EtcdMemberHealthy (if using managed etcd). // It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions. -func HasMissingNodeOrUnhealthyControlPlaneComponents(isEtcdManaged bool) Func { +func HasUnhealthyControlPlaneComponents(isEtcdManaged bool) Func { controlPlaneMachineHealthConditions := []clusterv1.ConditionType{ controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.MachineControllerManagerPodHealthyCondition, @@ -178,9 +178,10 @@ func HasMissingNodeOrUnhealthyControlPlaneComponents(isEtcdManaged bool) Func { if machine == nil { return false } - if !HasNode()(machine) { - return true - } + + // The machine without a node could be in failure status due to the kubelet config error, or still provisioning components (including etcd). + // So do not treat it as unhealthy. + for _, condition := range controlPlaneMachineHealthConditions { // Do not return true when the condition is not set or is set to Unknown because // it means a transient state and can not be considered as unhealthy. diff --git a/util/collections/machine_filters_test.go b/util/collections/machine_filters_test.go index 491db725795f..d398f1f42532 100644 --- a/util/collections/machine_filters_test.go +++ b/util/collections/machine_filters_test.go @@ -457,13 +457,13 @@ func TestHasNode(t *testing.T) { func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { t.Run("nil machine returns false", func(t *testing.T) { g := NewWithT(t) - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(nil)).To(BeFalse()) + g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(nil)).To(BeFalse()) }) - t.Run("machine without node returns true", func(t *testing.T) { + t.Run("machine without node returns false", func(t *testing.T) { g := NewWithT(t) machine := &clusterv1.Machine{} - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue()) + g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse()) }) t.Run("machine with all healthy controlPlane component conditions returns false when the Etcd is not managed", func(t *testing.T) { @@ -477,7 +477,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition), *conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition), } - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse()) + g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse()) }) t.Run("machine with unhealthy 'APIServerPodHealthy' condition returns true when the Etcd is not managed", func(t *testing.T) { @@ -492,7 +492,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.FalseCondition(controlplanev1.MachineAPIServerPodHealthyCondition, "", clusterv1.ConditionSeverityWarning, ""), } - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue()) + g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue()) }) t.Run("machine with unhealthy etcd component conditions returns false when Etcd is not managed", func(t *testing.T) { @@ -510,7 +510,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "", clusterv1.ConditionSeverityWarning, ""), } - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse()) + g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse()) }) t.Run("machine with unhealthy etcd conditions returns true when Etcd is managed", func(t *testing.T) { @@ -528,7 +528,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "", clusterv1.ConditionSeverityWarning, ""), } - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(true)(machine)).To(BeTrue()) + g.Expect(collections.HasUnhealthyControlPlaneComponents(true)(machine)).To(BeTrue()) }) t.Run("machine with all healthy controlPlane and the Etcd component conditions returns false when Etcd is managed", func(t *testing.T) { @@ -544,7 +544,7 @@ func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) { *conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition), *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition), } - g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(true)(machine)).To(BeFalse()) + g.Expect(collections.HasUnhealthyControlPlaneComponents(true)(machine)).To(BeFalse()) }) }