Skip to content

Commit

Permalink
Wait to requeue if healthcheck fails
Browse files Browse the repository at this point in the history
Signed-off-by: Daniel Lipovetsky <dlipovetsky@d2iq.com>
  • Loading branch information
dlipovetsky committed Feb 14, 2020
1 parent 8469cbc commit 57a9876
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ const (
// DeleteRequeueAfter is how long to wait before checking again to see if
// all control plane machines have been deleted.
DeleteRequeueAfter = 30 * time.Second

// HealthCheckFailedRequeueAfter is how long to wait before trying to scale
// up/down if a target cluster health check has failed.
HealthCheckFailedRequeueAfter = 20 * time.Second
)

type managementCluster interface {
Expand Down Expand Up @@ -217,7 +221,6 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
if requeueErr, ok := errors.Cause(err).(capierrors.HasRequeueAfterError); ok {
logger.Error(err, "required certificates not found, requeueing")
return ctrl.Result{
Requeue: true,
RequeueAfter: requeueErr.GetRequeueAfter(),
}, nil
}
Expand Down Expand Up @@ -260,36 +263,31 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, cluster *
case numMachines < desiredReplicas && numMachines == 0:
// Create new Machine w/ init
logger.Info("Initializing control plane", "Desired", desiredReplicas, "Existing", numMachines)
if err := r.initializeControlPlane(ctx, cluster, kcp); err != nil {
result, err := r.initializeControlPlane(ctx, cluster, kcp)
if err != nil {
logger.Error(err, "Failed to initialize control plane")
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedInitialization", "Failed to initialize cluster %s/%s control plane: %v", cluster.Namespace, cluster.Name, err)
return ctrl.Result{}, err
}
numMachines++
return result, err
// We are scaling up
case numMachines < desiredReplicas && numMachines > 0:
// Create a new Machine w/ join
logger.Info("Scaling up control plane", "Desired", desiredReplicas, "Existing", numMachines)
if err := r.scaleUpControlPlane(ctx, cluster, kcp); err != nil {
result, err := r.scaleUpControlPlane(ctx, cluster, kcp)
if err != nil {
logger.Error(err, "Failed to scale up control plane")
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedScaleUp", "Failed to scale up cluster %s/%s control plane: %v", cluster.Namespace, cluster.Name, err)
return ctrl.Result{}, err
}
numMachines++
return result, err
// We are scaling down
case numMachines > desiredReplicas:
logger.Info("Scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)
if err := r.scaleDownControlPlane(ctx, cluster, kcp); err != nil {
result, err := r.scaleDownControlPlane(ctx, cluster, kcp)
if err != nil {
logger.Error(err, "Failed to scale down control plane")
r.recorder.Eventf(kcp, corev1.EventTypeWarning, "FailedScaleDown", "Failed to scale down cluster %s/%s control plane: %v", cluster.Namespace, cluster.Name, err)
return ctrl.Result{}, err
}
numMachines--
}

// Requeue to continue scaling up/down
if numMachines != desiredReplicas {
return ctrl.Result{Requeue: true}, nil
return result, err
}

return ctrl.Result{}, nil
Expand Down Expand Up @@ -363,24 +361,24 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(ctx context.Context,
return nil
}

func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) error {
func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
bootstrapSpec := kcp.Spec.KubeadmConfigSpec.DeepCopy()
bootstrapSpec.JoinConfiguration = nil

if err := r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec); err != nil {
return errors.Wrapf(err, "failed to create control plane Machine for cluster %s/%s", cluster.Name, cluster.Namespace)
return ctrl.Result{}, errors.Wrapf(err, "failed to create control plane Machine for cluster %s/%s", cluster.Name, cluster.Namespace)
}

return nil
return ctrl.Result{Requeue: true}, nil
}

func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) error {
func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
return errors.Wrapf(err, "cluster %s/%s control plane is not healthy", cluster.Name, cluster.Namespace)
return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "control plane is not healthy")
}

if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
return errors.Wrapf(err, "cluster %s/%s etcd cluster is not healthy", cluster.Name, cluster.Namespace)
return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "etcd cluster is not healthy")
}

// Create the bootstrap configuration
Expand All @@ -389,41 +387,41 @@ func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context,
bootstrapSpec.ClusterConfiguration = nil

if err := r.cloneConfigsAndGenerateMachine(ctx, cluster, kcp, bootstrapSpec); err != nil {
return errors.Wrapf(err, "failed to create control plane Machine for cluster %s/%s", cluster.Name, cluster.Namespace)
return ctrl.Result{}, errors.Wrapf(err, "failed to create control plane Machine for cluster %s/%s", cluster.Name, cluster.Namespace)
}

return nil
return ctrl.Result{Requeue: true}, nil
}

func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) error {
func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane) (ctrl.Result, error) {
if err := r.managementCluster.TargetClusterControlPlaneIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
return errors.Wrapf(err, "control plane is not healthy")
return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "control plane is not healthy")
}

if err := r.managementCluster.TargetClusterEtcdIsHealthy(ctx, clusterKey(cluster), kcp.Name); err != nil {
return errors.Wrapf(err, "etcd cluster is not healthy")
return ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}, errors.Wrap(err, "etcd cluster is not healthy")
}

ownedMachines, err := r.managementCluster.GetMachinesForCluster(ctx, clusterKey(cluster), internal.OwnedControlPlaneMachines(kcp.Name))
if err != nil {
return err
return ctrl.Result{}, err
}

// Wait for any delete in progress to complete before deleting another Machine
if len(internal.FilterMachines(ownedMachines, internal.HasDeletionTimestamp())) > 0 {
return nil
return ctrl.Result{RequeueAfter: DeleteRequeueAfter}, nil
}

machineToDelete, err := oldestMachine(ownedMachines)
if err != nil {
return errors.Wrap(err, "failed to pick control plane Machine to delete")
return ctrl.Result{}, errors.Wrap(err, "failed to pick control plane Machine to delete")
}

if err := r.Client.Delete(ctx, &machineToDelete); err != nil && !apierrors.IsNotFound(err) {
return errors.Wrapf(err, "failed to delete control plane Machine %s/%s", machineToDelete.Namespace, machineToDelete.Name)
return ctrl.Result{}, errors.Wrapf(err, "failed to delete control plane Machine %s/%s", machineToDelete.Namespace, machineToDelete.Name)
}

return nil
return ctrl.Result{Requeue: true}, nil
}

func (r *KubeadmControlPlaneReconciler) cloneConfigsAndGenerateMachine(ctx context.Context, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, bootstrapSpec *bootstrapv1.KubeadmConfigSpec) error {
Expand Down Expand Up @@ -615,7 +613,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, clu
if errs != nil {
return ctrl.Result{}, kerrors.NewAggregate(errs)
}
return ctrl.Result{Requeue: true, RequeueAfter: DeleteRequeueAfter}, nil
return ctrl.Result{RequeueAfter: DeleteRequeueAfter}, nil
}

func (r *KubeadmControlPlaneReconciler) reconcileKubeconfig(ctx context.Context, clusterName types.NamespacedName, endpoint clusterv1.APIEndpoint, kcp *controlplanev1.KubeadmControlPlane) error {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,9 @@ func TestKubeadmControlPlaneReconciler_initializeControlPlane(t *testing.T) {
Log: log.Log,
}

g.Expect(r.initializeControlPlane(context.Background(), cluster, kcp)).To(Succeed())
result, err := r.initializeControlPlane(context.Background(), cluster, kcp)
g.Expect(result).To(Equal(ctrl.Result{Requeue: true}))
g.Expect(err).NotTo(HaveOccurred())

machineList := &clusterv1.MachineList{}
g.Expect(fakeClient.List(context.Background(), machineList, client.InNamespace(cluster.Namespace))).To(Succeed())
Expand Down Expand Up @@ -618,7 +620,7 @@ func TestReconcileInitializeControlPlane(t *testing.T) {

result, err := r.Reconcile(ctrl.Request{NamespacedName: types.NamespacedName{Name: kcp.Name, Namespace: kcp.Namespace}})
g.Expect(err).NotTo(HaveOccurred())
g.Expect(result).To(Equal(ctrl.Result{}))
g.Expect(result).To(Equal(ctrl.Result{Requeue: true}))
g.Expect(r.Client.Get(context.Background(), types.NamespacedName{Name: kcp.Name, Namespace: kcp.Namespace}, kcp)).To(Succeed())

// Expect the referenced infrastructure template to have a Cluster Owner Reference.
Expand Down Expand Up @@ -1234,7 +1236,7 @@ func TestKubeadmControlPlaneReconciler_reconcileDelete(t *testing.T) {
}

result, err := r.reconcileDelete(context.Background(), cluster, kcp, log.Log)
g.Expect(result).To(Equal(ctrl.Result{Requeue: true, RequeueAfter: DeleteRequeueAfter}))
g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: DeleteRequeueAfter}))
g.Expect(err).To(BeNil())
g.Expect(kcp.Finalizers).To(ContainElement(controlplanev1.KubeadmControlPlaneFinalizer))

Expand Down Expand Up @@ -1356,7 +1358,9 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {
managementCluster: fmc,
}

g.Expect(r.scaleUpControlPlane(context.Background(), cluster, kcp)).To(Succeed())
result, err := r.scaleUpControlPlane(context.Background(), cluster, kcp)
g.Expect(result).To(Equal(ctrl.Result{Requeue: true}))
g.Expect(err).ToNot(HaveOccurred())

controlPlaneMachines := clusterv1.MachineList{}
g.Expect(fakeClient.List(context.Background(), &controlPlaneMachines)).To(Succeed())
Expand All @@ -1373,11 +1377,16 @@ func TestKubeadmControlPlaneReconciler_scaleUpControlPlane(t *testing.T) {

fmc.ControlPlaneHealthy = true
fmc.EtcdHealthy = false
g.Expect(r.scaleUpControlPlane(context.Background(), &clusterv1.Cluster{}, &controlplanev1.KubeadmControlPlane{})).NotTo(Succeed())
result, err := r.scaleUpControlPlane(context.Background(), &clusterv1.Cluster{}, &controlplanev1.KubeadmControlPlane{})
g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}))
g.Expect(err).To(HaveOccurred())

fmc.ControlPlaneHealthy = false
fmc.EtcdHealthy = true
g.Expect(r.scaleUpControlPlane(context.Background(), &clusterv1.Cluster{}, &controlplanev1.KubeadmControlPlane{})).NotTo(Succeed())
result, err = r.scaleUpControlPlane(context.Background(), &clusterv1.Cluster{}, &controlplanev1.KubeadmControlPlane{})
g.Expect(result).To(Equal(ctrl.Result{RequeueAfter: HealthCheckFailedRequeueAfter}))
g.Expect(err).To(HaveOccurred())

})
}

Expand Down

0 comments on commit 57a9876

Please sign in to comment.