diff --git a/pkg/exporters/k8sexporter/k8s_exporter.go b/pkg/exporters/k8sexporter/k8s_exporter.go index 141e30129..daf42e5b7 100644 --- a/pkg/exporters/k8sexporter/k8s_exporter.go +++ b/pkg/exporters/k8sexporter/k8s_exporter.go @@ -105,13 +105,13 @@ func (ke *k8sExporter) startHTTPReporting(npdo *options.NodeProblemDetectorOptio } func waitForAPIServerReadyWithTimeout(ctx context.Context, c problemclient.Client, npdo *options.NodeProblemDetectorOptions) error { - return wait.PollImmediate(npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout, func() (done bool, err error) { + return wait.PollUntilContextTimeout(ctx, npdo.APIServerWaitInterval, npdo.APIServerWaitTimeout,true, func(ctx context.Context) (done bool,err error) { // If NPD can get the node object from kube-apiserver, the server is // ready and the RBAC permission is set correctly. if _, err := c.GetNode(ctx); err != nil { klog.Errorf("Can't get node object: %v", err) - return false, nil + return false,err } - return true, nil + return true,nil }) } diff --git a/pkg/exporters/k8sexporter/problemclient/problem_client.go b/pkg/exporters/k8sexporter/problemclient/problem_client.go index fede9bf34..40ee134f8 100644 --- a/pkg/exporters/k8sexporter/problemclient/problem_client.go +++ b/pkg/exporters/k8sexporter/problemclient/problem_client.go @@ -31,6 +31,7 @@ import ( clientset "k8s.io/client-go/kubernetes" typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" "k8s.io/klog/v2" "k8s.io/utils/clock" @@ -107,7 +108,15 @@ func (c *nodeProblemClient) SetConditions(ctx context.Context, newConditions []v if err != nil { return err } - return c.client.RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(c.nodeName).SubResource("status").Body(patch).Do(ctx).Error() + return retry.OnError(retry.DefaultRetry, + func(error) bool{ + return true + }, + func() error { + _, err := c.client.Nodes().PatchStatus(ctx, c.nodeName, patch) + return err + }, + ) } func (c *nodeProblemClient) Eventf(eventType, source, reason, messageFmt string, args ...interface{}) { diff --git a/vendor/k8s.io/client-go/util/retry/OWNERS b/vendor/k8s.io/client-go/util/retry/OWNERS new file mode 100644 index 000000000..75736b5aa --- /dev/null +++ b/vendor/k8s.io/client-go/util/retry/OWNERS @@ -0,0 +1,4 @@ +# See the OWNERS docs at https://go.k8s.io/owners + +reviewers: + - caesarxuchao diff --git a/vendor/k8s.io/client-go/util/retry/util.go b/vendor/k8s.io/client-go/util/retry/util.go new file mode 100644 index 000000000..0c6e504a6 --- /dev/null +++ b/vendor/k8s.io/client-go/util/retry/util.go @@ -0,0 +1,105 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package retry + +import ( + "time" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/wait" +) + +// DefaultRetry is the recommended retry for a conflict where multiple clients +// are making changes to the same resource. +var DefaultRetry = wait.Backoff{ + Steps: 5, + Duration: 10 * time.Millisecond, + Factor: 1.0, + Jitter: 0.1, +} + +// DefaultBackoff is the recommended backoff for a conflict where a client +// may be attempting to make an unrelated modification to a resource under +// active management by one or more controllers. +var DefaultBackoff = wait.Backoff{ + Steps: 4, + Duration: 10 * time.Millisecond, + Factor: 5.0, + Jitter: 0.1, +} + +// OnError allows the caller to retry fn in case the error returned by fn is retriable +// according to the provided function. backoff defines the maximum retries and the wait +// interval between two retries. +func OnError(backoff wait.Backoff, retriable func(error) bool, fn func() error) error { + var lastErr error + err := wait.ExponentialBackoff(backoff, func() (bool, error) { + err := fn() + switch { + case err == nil: + return true, nil + case retriable(err): + lastErr = err + return false, nil + default: + return false, err + } + }) + if err == wait.ErrWaitTimeout { + err = lastErr + } + return err +} + +// RetryOnConflict is used to make an update to a resource when you have to worry about +// conflicts caused by other code making unrelated updates to the resource at the same +// time. fn should fetch the resource to be modified, make appropriate changes to it, try +// to update it, and return (unmodified) the error from the update function. On a +// successful update, RetryOnConflict will return nil. If the update function returns a +// "Conflict" error, RetryOnConflict will wait some amount of time as described by +// backoff, and then try again. On a non-"Conflict" error, or if it retries too many times +// and gives up, RetryOnConflict will return an error to the caller. +// +// err := retry.RetryOnConflict(retry.DefaultRetry, func() error { +// // Fetch the resource here; you need to refetch it on every try, since +// // if you got a conflict on the last update attempt then you need to get +// // the current version before making your own changes. +// pod, err := c.Pods("mynamespace").Get(name, metav1.GetOptions{}) +// if err != nil { +// return err +// } +// +// // Make whatever updates to the resource are needed +// pod.Status.Phase = v1.PodFailed +// +// // Try to update +// _, err = c.Pods("mynamespace").UpdateStatus(pod) +// // You have to return err itself here (not wrapped inside another error) +// // so that RetryOnConflict can identify it correctly. +// return err +// }) +// if err != nil { +// // May be conflict if max retries were hit, or may be something unrelated +// // like permissions or a network error +// return err +// } +// ... +// +// TODO: Make Backoff an interface? +func RetryOnConflict(backoff wait.Backoff, fn func() error) error { + return OnError(backoff, errors.IsConflict, fn) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index e42c65dee..e6c0147b0 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -762,6 +762,7 @@ k8s.io/client-go/util/connrotation k8s.io/client-go/util/flowcontrol k8s.io/client-go/util/homedir k8s.io/client-go/util/keyutil +k8s.io/client-go/util/retry k8s.io/client-go/util/workqueue # k8s.io/klog/v2 v2.100.1 ## explicit; go 1.13