Ensure that etcd health check errors are logged.
Update controlplane/kubeadm/controllers/controller.go

Co-authored-by: Vince Prignano <vince@vincepri.com>
Arvinderpal and vincepri committed Jul 28, 2020
1 parent 2d74380 commit 93797cb
Showing 3 changed files with 13 additions and 18 deletions.
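
The core of the change is swapping the silent requeue (ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter} with a nil error) for a returned, wrapped error. Below is a minimal, self-contained sketch of why that matters; it is not the project code, and oldStyle/newStyle are illustrative names. When a reconciler returns a non-nil error, controller-runtime logs it and requeues with backoff, whereas a RequeueAfter result with a nil error requeues without logging anything.

package main

import (
	"fmt"
	"time"

	"github.com/pkg/errors"
	ctrl "sigs.k8s.io/controller-runtime"
)

// oldStyle mirrors the previous behavior: the health-check error is dropped
// and only a fixed-delay requeue is requested, so nothing is logged upstream.
func oldStyle(_ error) (ctrl.Result, error) {
	return ctrl.Result{RequeueAfter: 20 * time.Second}, nil
}

// newStyle mirrors this commit: the error is wrapped and returned, so
// controller-runtime logs it and retries with backoff.
func newStyle(healthErr error) (ctrl.Result, error) {
	return ctrl.Result{}, errors.Wrap(healthErr, "waiting for control plane to pass health check")
}

func main() {
	err := errors.New("etcd cluster is not healthy")

	res, _ := oldStyle(err)
	fmt.Printf("old: requeue after %s, error hidden from the controller logs\n", res.RequeueAfter)

	if _, e := newStyle(err); e != nil {
		fmt.Printf("new: error surfaced to the caller: %v\n", e)
	}
}

Note that the diffs below still emit the ControlPlaneUnhealthy warning events; the returned error is what additionally makes the failure show up in the controller logs.
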
21 changes: 9 additions & 12 deletions controlplane/kubeadm/controllers/controller.go
@@ -421,32 +421,29 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
 // It removes any etcd members that do not have a corresponding node.
 // Also, as a final step, checks if there is any machines that is being deleted.
 func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
-	logger := controlPlane.Logger()
-
 	// Do a health check of the Control Plane components
 	if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
-		logger.V(2).Info("Waiting for control plane to pass control plane health check to continue reconciliation", "cause", err)
 		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
 			"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
-		return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
+		return ctrl.Result{}, errors.Wrap(err, "Waiting for control plane to pass control plane health check to continue reconciliation")
 	}
 
 	// Ensure etcd is healthy
 	if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
+		errList := []error{}
+		errList = append(errList, errors.Wrap(err, "Waiting for control plane to pass etcd health check to continue reconciliation"))
+		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
+			"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)
 		// If there are any etcd members that do not have corresponding nodes, remove them from etcd and from the kubeadm configmap.
 		// This will solve issues related to manual control-plane machine deletion.
 		workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
 		if err != nil {
-			return ctrl.Result{}, err
-		}
-		if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
-			logger.V(2).Info("Failed attempt to remove potential hanging etcd members to pass etcd health check to continue reconciliation", "cause", err)
+			errList = append(errList, errors.Wrap(err, "cannot get remote client to workload cluster"))
+		} else if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
+			errList = append(errList, errors.Wrap(err, "Failed attempt to remove potential hanging etcd members to pass etcd health check to continue reconciliation"))
		}
 
-		logger.V(2).Info("Waiting for control plane to pass etcd health check to continue reconciliation", "cause", err)
-		r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
-			"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)
-		return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
+		return ctrl.Result{}, kerrors.NewAggregate(errList)
 	}
 
 	// We need this check for scale up as well as down to avoid scaling up when there is a machine being deleted.
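
The rewritten etcd branch collects every failure, including a failed attempt to clean up hanging etcd members, into errList and returns one aggregated error. Here is a self-contained sketch of that pattern, assuming the github.com/pkg/errors and k8s.io/apimachinery modules are available; it is an illustration, not the controller code.

package main

import (
	"fmt"

	"github.com/pkg/errors"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

func main() {
	// Pretend both the etcd health check and the member-remediation attempt failed.
	healthErr := errors.New("etcd cluster is not healthy")
	remediationErr := errors.New("cannot get remote client to workload cluster")

	errList := []error{}
	errList = append(errList, errors.Wrap(healthErr, "waiting for control plane to pass etcd health check to continue reconciliation"))
	errList = append(errList, remediationErr)

	// NewAggregate folds all collected failures into one error value (and
	// returns nil for an empty slice), so a single return statement lets the
	// caller log the full picture.
	fmt.Println(kerrors.NewAggregate(errList))
}
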
5 changes: 2 additions & 3 deletions controlplane/kubeadm/controllers/scale_test.go
@@ -170,9 +170,8 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
 		Machines: beforeMachines,
 	}
 
-	result, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane)
-	g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}))
-	g.Expect(err).To(BeNil())
+	_, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane)
+	g.Expect(err).To(HaveOccurred())
 
 	controlPlaneMachines := &clusterv1.MachineList{}
 	g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed())
5 changes: 2 additions & 3 deletions controlplane/kubeadm/controllers/upgrade_test.go
@@ -89,9 +89,8 @@ func TestKubeadmControlPlaneReconciler_upgradeControlPlane(t *testing.T) {
 
 	// run upgrade a second time, simulate that the node has not appeared yet but the machine exists
 	r.managementCluster.(*fakeManagementCluster).ControlPlaneHealthy = false
-	result, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane, needingUpgrade)
-	g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}))
-	g.Expect(err).To(BeNil())
+	_, err = r.upgradeControlPlane(context.Background(), cluster, kcp, controlPlane, needingUpgrade)
+	g.Expect(err).To(HaveOccurred())
 	g.Expect(fakeClient.List(context.Background(), bothMachines, client.InNamespace(cluster.Namespace))).To(Succeed())
 	g.Expect(bothMachines.Items).To(HaveLen(2))
 
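
Both test updates follow the same pattern: rather than expecting a RequeueAfter result with a nil error, they only assert that an error is returned. A stripped-down Gomega sketch of that assertion style follows, with fakeScaleUp as a hypothetical stand-in for the reconciler call rather than part of the real suite.

package sketch

import (
	"errors"
	"testing"

	. "github.com/onsi/gomega"
)

// fakeScaleUp stands in for calls such as r.scaleUpControlPlane when a health
// check fails; it is hypothetical scaffolding for this example.
func fakeScaleUp() (int, error) {
	return 0, errors.New("control plane health check failed")
}

func TestScaleUpReturnsErrorWhenHealthCheckFails(t *testing.T) {
	g := NewWithT(t)

	_, err := fakeScaleUp()
	g.Expect(err).To(HaveOccurred())
}
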
