Fix waiting for all kube-system pods having one of specified labels to be Ready
prezha committed Jan 31, 2025
1 parent 5f3b960 commit 7fcbacd
Showing 2 changed files with 26 additions and 15 deletions.
34 changes: 21 additions & 13 deletions pkg/minikube/bootstrapper/bsutil/kverify/pod_ready.go
@@ -19,7 +19,6 @@ package kverify
 
 import (
 	"context"
-	"errors"
 	"fmt"
 	"time"
 
@@ -33,10 +32,10 @@ import (
 
 // WaitExtra calls waitPodCondition for all (at least one) kube-system pods having one of specified labels to be "Ready".
 func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration) error {
-	klog.Infof("extra waiting up to %v for all kube-system pods having one of %v labels to be %q ...", timeout, labels, core.PodReady)
+	klog.Infof("extra waiting up to %v for all %q pods having one of %v labels to be %q ...", timeout, meta.NamespaceSystem, labels, core.PodReady)
 	start := time.Now()
 	defer func() {
-		klog.Infof("duration metric: took %s for extra waiting for all kube-system pods having one of %v labels to be %q ...", time.Since(start), labels, core.PodReady)
+		klog.Infof("duration metric: took %s for extra waiting for all %q pods having one of %v labels to be %q ...", time.Since(start), meta.NamespaceSystem, labels, core.PodReady)
 	}()
 
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
@@ -56,7 +55,7 @@ func WaitExtra(cs *kubernetes.Clientset, labels []string, timeout time.Duration) error {
 		}
 		for _, pod := range pods.Items {
 			if err := waitPodCondition(ctx, cs, pod.Name, pod.Namespace, core.PodReady); err != nil {
-				klog.Warningf("pods in %q namespace with %q label not %q, will retry: %v", meta.NamespaceSystem, label, core.PodReady, err)
+				klog.Warningf("not all pods in %q namespace with %q label are %q, will retry: %v", meta.NamespaceSystem, label, core.PodReady, err)
 				return false, nil
 			}
 		}
@@ -81,21 +80,30 @@ func waitPodCondition(ctx context.Context, cs *kubernetes.Clientset, name, namespace string, condition core.PodConditionType) error {
 		klog.Infof("duration metric: took %s for pod %q in %q namespace to be %q or be gone ...", time.Since(start), name, namespace, condition)
 	}()
 
+	lap := time.Now()
 	checkCondition := func(_ context.Context) (bool, error) {
 		status, reason := podConditionStatus(cs, name, namespace, condition)
-		// ok or skip if status == core.TaintNodeNotReady - we check node health elsewhere
-		if status == core.ConditionTrue || status == core.TaintNodeNotReady {
+		// done if pod is ready
+		if status == core.ConditionTrue {
 			klog.Info(reason)
 			return true, nil
 		}
-		// fail: status == core.ConditionUnknown
-		if status == core.ConditionUnknown {
-			return false, errors.New(reason)
+
+		// back off if pod condition is unknown or node is not ready - we check node health elsewhere
+		if status == core.ConditionUnknown || status == core.TaintNodeNotReady {
+			klog.Warning(reason)
+			return true, nil
 		}
+
+		// retry in all other cases (e.g., pod pending, pod not ready, etc.)
+		// decrease log spam
+		if time.Since(lap) > (2 * time.Second) {
+			klog.Warning(reason)
+			lap = time.Now()
+		}
-		// retry: status == core.ConditionFalse
-		return false, errors.New(reason)
+		return false, nil
 	}
-	if err := wait.PollUntilContextCancel(ctx, kconst.APICallRetryInterval, true, checkCondition); err != nil {
+	if err := wait.PollUntilContextCancel(ctx, kconst.APICallRetryInterval, false, checkCondition); err != nil {
		return fmt.Errorf("waitPodCondition: %w", err)
 	}

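The rewritten checkCondition now distinguishes three outcomes: stop successfully when the pod is Ready, back off (also stop) when the pod's condition is unknown or its node is not Ready (node health is verified elsewhere), and otherwise keep retrying without returning an error, logging at most once every two seconds. The third argument of wait.PollUntilContextCancel also flips from true to false, so the first probe now runs after one poll interval rather than immediately. Below is a minimal, self-contained sketch of that pattern; checkPod and waitReady are illustrative stand-ins, not minikube code:

package main

import (
	"context"
	"fmt"
	"time"

	core "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/wait"
)

// checkPod is a hypothetical stand-in for podConditionStatus: it returns
// the pod condition status plus a human-readable reason.
func checkPod() (core.ConditionStatus, string) {
	return core.ConditionFalse, "pod is not Ready yet"
}

func waitReady(ctx context.Context, interval time.Duration) error {
	lap := time.Now()
	check := func(_ context.Context) (bool, error) {
		status, reason := checkPod()
		// done: pod is Ready
		if status == core.ConditionTrue {
			fmt.Println(reason)
			return true, nil
		}
		// back off: condition unknown - stop waiting here; node health
		// would be checked elsewhere
		if status == core.ConditionUnknown {
			fmt.Println("warning:", reason)
			return true, nil
		}
		// retry: returning (false, nil) makes the poller call us again;
		// log at most once every 2s to limit log spam
		if time.Since(lap) > 2*time.Second {
			fmt.Println("retrying:", reason)
			lap = time.Now()
		}
		return false, nil
	}
	// immediate=false: the first check runs only after one full interval
	return wait.PollUntilContextCancel(ctx, interval, false, check)
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := waitReady(ctx, time.Second); err != nil {
		fmt.Println("waitReady:", err)
	}
}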
@@ -112,7 +120,7 @@ func podConditionStatus(cs *kubernetes.Clientset, name, namespace string, condition core.PodConditionType) (core.ConditionStatus, string) {
 	// check if underlying node is Ready - skip in case we got stale data about the pod
 	if pod.Spec.NodeName != "" {
 		if status, reason := nodeConditionStatus(cs, pod.Spec.NodeName, core.NodeReady); status != core.ConditionTrue {
-			return core.TaintNodeNotReady, fmt.Sprintf("node %q hosting pod %q in %q namespace is currently not %q (skipping!): %v", pod.Spec.NodeName, name, namespace, core.NodeReady, reason)
+			return core.TaintNodeNotReady, fmt.Sprintf("node %q hosting pod %q in %q namespace is not %q (skipping!): %v", pod.Spec.NodeName, name, namespace, core.NodeReady, reason)
 		}
 	}

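The node-readiness guard above relies on nodeConditionStatus, which is not shown in this diff. A plausible sketch of such a helper using standard client-go calls follows; the name, signature, and imports (the same aliases as in pod_ready.go) are assumptions, not taken from this commit:

// nodeReady is a hypothetical helper in the spirit of nodeConditionStatus:
// it fetches the node and reports its NodeReady condition.
func nodeReady(cs *kubernetes.Clientset, name string) (core.ConditionStatus, string) {
	node, err := cs.CoreV1().Nodes().Get(context.Background(), name, meta.GetOptions{})
	if err != nil {
		return core.ConditionUnknown, fmt.Sprintf("error getting node %q: %v", name, err)
	}
	for _, c := range node.Status.Conditions {
		if c.Type == core.NodeReady {
			return c.Status, fmt.Sprintf("node %q condition %q is %q", name, c.Type, c.Status)
		}
	}
	return core.ConditionUnknown, fmt.Sprintf("node %q has no %q condition", name, core.NodeReady)
}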
7 changes: 5 additions & 2 deletions pkg/minikube/bootstrapper/kubeadm/kubeadm.go
@@ -589,7 +589,7 @@ func (k *Bootstrapper) WaitForNode(cfg config.ClusterConfig, n config.Node, timeout time.Duration) error {
 }
 
 // restartPrimaryControlPlane restarts the kubernetes cluster configured by kubeadm.
-func (k *Bootstrapper) restartPrimaryControlPlane(cfg config.ClusterConfig) error {
+func (k *Bootstrapper) restartPrimaryControlPlane(cfg config.ClusterConfig) error { //nolint: gocyclo
 	klog.Infof("restartPrimaryControlPlane start ...")
 
 	start := time.Now()
@@ -739,7 +739,10 @@ func (k *Bootstrapper) restartPrimaryControlPlane(cfg config.ClusterConfig) error {
 		klog.Infof("kubelet initialised")
 		klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))
 
-		if err := kverify.WaitExtra(client, kverify.CorePodsLabels, kconst.DefaultControlPlaneTimeout); err != nil {
+		// for an HA (multi-control-plane) cluster, the primary control-plane node (and the pods scheduled there) will not come up on its own until a secondary joins
+		if config.IsHA(cfg) {
+			klog.Infof("HA (multi-control plane) cluster: will skip waiting for pods on primary control-plane node %+v", pcp)
+		} else if err := kverify.WaitExtra(client, kverify.CorePodsLabels, kconst.DefaultControlPlaneTimeout); err != nil {
 			return errors.Wrap(err, "extra")
 		}
 	}
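The new guard depends on config.IsHA, whose definition is not part of this diff. A plausible reading, sketched under the assumption that the cluster config carries a control-plane flag per node (not necessarily minikube's exact implementation): a cluster is HA when more than one control-plane node is configured.

// sketchNode mirrors just the fields this sketch needs; minikube's
// config.Node has more.
type sketchNode struct {
	Name         string
	ControlPlane bool
}

// isHA reports whether the cluster has more than one control-plane node -
// a hypothetical stand-in for config.IsHA.
func isHA(nodes []sketchNode) bool {
	controlPlanes := 0
	for _, n := range nodes {
		if n.ControlPlane {
			controlPlanes++
		}
	}
	return controlPlanes > 1
}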