Skip to content

Commit

Permalink
Merge pull request #3336 from Nordix/kcp-log-etcd-errors
Browse files Browse the repository at this point in the history
🐛 Ensure that etcd health check errors are logged in KCP
  • Loading branch information
k8s-ci-robot committed Aug 28, 2020
2 parents 6ee07b3 + 3ad74fb commit 0a5f988
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 13 deletions.
18 changes: 7 additions & 11 deletions controlplane/kubeadm/controllers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -446,11 +446,9 @@ func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(o handler.M
// It removes any etcd members that do not have a corresponding node.
// Also, as a final step, checks if there are any machines that are being deleted.
func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
logger := controlPlane.Logger()

// Do a health check of the Control Plane components
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
logger.V(2).Info("Waiting for control plane to pass control plane health check to continue reconciliation", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass control plane health check to continue reconciliation: %v", err)
return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
Expand All @@ -459,20 +457,18 @@ func (r *KubeadmControlPlaneReconciler) reconcileHealth(ctx context.Context, clu
// If KCP should manage etcd, ensure etcd is healthy.
if controlPlane.IsEtcdManaged() {
if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, util.ObjectKey(cluster)); err != nil {
errList := []error{errors.Wrap(err, "failed to pass etcd health check")}
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)
// If there are any etcd members that do not have corresponding nodes, remove them from etcd and from the kubeadm configmap.
// This will solve issues related to manual control-plane machine deletion.
workloadCluster, err := r.managementCluster.GetWorkloadCluster(ctx, util.ObjectKey(cluster))
if err != nil {
return ctrl.Result{}, err
}
if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
logger.V(2).Info("Failed attempt to remove potential hanging etcd members to pass etcd health check to continue reconciliation", "cause", err)
errList = append(errList, errors.Wrap(err, "cannot get remote client to workload cluster"))
} else if err := workloadCluster.ReconcileEtcdMembers(ctx); err != nil {
errList = append(errList, errors.Wrap(err, "failed attempt to remove potential hanging etcd members to pass etcd health check to continue reconciliation"))
}

logger.V(2).Info("Waiting for control plane to pass etcd health check to continue reconciliation", "cause", err)
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
"Waiting for control plane to pass etcd health check to continue reconciliation: %v", err)
return ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}, nil
return ctrl.Result{}, kerrors.NewAggregate(errList)
}
}

Expand Down
12 changes: 10 additions & 2 deletions controlplane/kubeadm/controllers/scale_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,20 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
name string
etcdUnHealthy bool
controlPlaneUnHealthy bool
expectErr bool
expectResult ctrl.Result
}{
{
name: "etcd health check fails",
etcdUnHealthy: true,
expectErr: true,
expectResult: ctrl.Result{},
},
{
name: "controlplane component health check fails",
controlPlaneUnHealthy: true,
expectErr: false,
expectResult: ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter},
},
}
for _, tc := range testCases {
Expand All @@ -171,8 +177,10 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
}

result, err := r.scaleUpControlPlane(context.Background(), cluster.DeepCopy(), kcp.DeepCopy(), controlPlane)
g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: healthCheckFailedRequeueAfter}))
g.Expect(err).To(BeNil())
if tc.expectErr {
g.Expect(err).To(HaveOccurred())
}
g.Expect(result).To(Equal(tc.expectResult))

controlPlaneMachines := &clusterv1.MachineList{}
g.Expect(fakeClient.List(context.Background(), controlPlaneMachines)).To(Succeed())
Expand Down

0 comments on commit 0a5f988

Please sign in to comment.