From 7d67e9f520e7b812d687c032c6d159b28fb6c50d Mon Sep 17 00:00:00 2001 From: David Eads Date: Tue, 9 Mar 2021 10:29:50 -0500 Subject: [PATCH] UPSTREAM: : provide events, messages, and bodies for probe failures of important pods UPSTREAM: : provide unique reason for pod probe event during termination OpenShift-Rebase-Source: 01542fcf1c4 --- pkg/kubelet/prober/patch_prober.go | 56 ++++++++++++++++++++++++++++++ pkg/kubelet/prober/prober.go | 2 +- pkg/probe/http/http.go | 16 +++++---- pkg/probe/http/patch_http.go | 25 +++++++++++++ 4 files changed, 91 insertions(+), 8 deletions(-) create mode 100644 pkg/kubelet/prober/patch_prober.go create mode 100644 pkg/probe/http/patch_http.go diff --git a/pkg/kubelet/prober/patch_prober.go b/pkg/kubelet/prober/patch_prober.go new file mode 100644 index 0000000000000..02add8ae82512 --- /dev/null +++ b/pkg/kubelet/prober/patch_prober.go @@ -0,0 +1,56 @@ +package prober + +import ( + "net/http" + "strings" + "time" + + v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + "k8s.io/kubernetes/pkg/probe" + httpprobe "k8s.io/kubernetes/pkg/probe/http" +) + +func (pb *prober) maybeProbeForBody(prober httpprobe.Prober, req *http.Request, timeout time.Duration, pod *v1.Pod, container v1.Container, probeType probeType) (probe.Result, string, error) { + if !isInterestingPod(pod) { + return prober.Probe(req, timeout) + } + bodyProber, ok := prober.(httpprobe.DetailedProber) + if !ok { + return prober.Probe(req, timeout) + } + result, output, body, probeError := bodyProber.ProbeForBody(req, timeout) + switch result { + case probe.Success: + return result, output, probeError + case probe.Warning, probe.Failure, probe.Unknown: + // these pods are interesting enough to show the body content + klog.Infof("interesting pod/%s container/%s namespace/%s: %s probe status=%v output=%q start-of-body=%s", + pod.Name, container.Name, pod.Namespace, probeType, result, output, body) + + reason := "ProbeError" // this is the normal value + if pod.DeletionTimestamp != nil { + // If the container was sent a sig-term, we want to have a different reason so we can distinguish this in our + // monitoring and watching code. + // Pod delete does this, but there are other possible reasons as well. We'll start with pod delete to improve the state of the world. + reason = "TerminatingPodProbeError" + } + + // in fact, they are so interesting we'll try to send events for them + pb.recordContainerEvent(pod, &container, v1.EventTypeWarning, reason, "%s probe error: %s\nbody: %s\n", probeType, output, body) + return result, output, probeError + default: + return result, output, probeError + } +} + +func isInterestingPod(pod *v1.Pod) bool { + if pod == nil { + return false + } + if strings.HasPrefix(pod.Namespace, "openshift-") { + return true + } + + return false +} diff --git a/pkg/kubelet/prober/prober.go b/pkg/kubelet/prober/prober.go index c1936db2efa7a..96b3913bd7664 100644 --- a/pkg/kubelet/prober/prober.go +++ b/pkg/kubelet/prober/prober.go @@ -154,7 +154,7 @@ func (pb *prober) runProbe(ctx context.Context, probeType probeType, p *v1.Probe headers := p.HTTPGet.HTTPHeaders klogV4.InfoS("HTTP-Probe", "scheme", scheme, "host", host, "port", port, "path", path, "timeout", timeout, "headers", headers) } - return pb.http.Probe(req, timeout) + return pb.maybeProbeForBody(pb.http, req, timeout, pod, container, probeType) case p.TCPSocket != nil: port, err := probe.ResolveContainerPort(p.TCPSocket.Port, &container) diff --git a/pkg/probe/http/http.go b/pkg/probe/http/http.go index 20e33da8ed478..d74c6776e000e 100644 --- a/pkg/probe/http/http.go +++ b/pkg/probe/http/http.go @@ -78,7 +78,8 @@ func (pr httpProber) Probe(req *http.Request, timeout time.Duration) (probe.Resu Transport: pr.transport, CheckRedirect: RedirectChecker(pr.followNonLocalRedirects), } - return DoHTTPProbe(req, client) + result, details, _, err := DoHTTPProbe(req, client) + return result, details, err } // GetHTTPInterface is an interface for making HTTP requests, that returns a response and error. @@ -90,13 +91,13 @@ type GetHTTPInterface interface { // If the HTTP response code is successful (i.e. 400 > code >= 200), it returns Success. // If the HTTP response code is unsuccessful or HTTP communication fails, it returns Failure. // This is exported because some other packages may want to do direct HTTP probes. -func DoHTTPProbe(req *http.Request, client GetHTTPInterface) (probe.Result, string, error) { +func DoHTTPProbe(req *http.Request, client GetHTTPInterface) (probe.Result, string, string, error) { url := req.URL headers := req.Header res, err := client.Do(req) if err != nil { // Convert errors into failures to catch timeouts. - return probe.Failure, err.Error(), nil + return probe.Failure, err.Error(), "", nil } defer res.Body.Close() b, err := utilio.ReadAtMost(res.Body, maxRespBodyLength) @@ -104,22 +105,23 @@ func DoHTTPProbe(req *http.Request, client GetHTTPInterface) (probe.Result, stri if err == utilio.ErrLimitReached { klog.V(4).Infof("Non fatal body truncation for %s, Response: %v", url.String(), *res) } else { - return probe.Failure, "", err + return probe.Failure, "", "", err } } body := string(b) if res.StatusCode >= http.StatusOK && res.StatusCode < http.StatusBadRequest { if res.StatusCode >= http.StatusMultipleChoices { // Redirect klog.V(4).Infof("Probe terminated redirects for %s, Response: %v", url.String(), *res) - return probe.Warning, fmt.Sprintf("Probe terminated redirects, Response body: %v", body), nil + return probe.Warning, fmt.Sprintf("Probe terminated redirects, Response body: %v", body), body, nil } klog.V(4).Infof("Probe succeeded for %s, Response: %v", url.String(), *res) - return probe.Success, body, nil + return probe.Success, body, body, nil } klog.V(4).Infof("Probe failed for %s with request headers %v, response body: %v", url.String(), headers, body) // Note: Until https://issue.k8s.io/99425 is addressed, this user-facing failure message must not contain the response body. + // @deads2k recommended we return the body. Slack discussion: https://redhat-internal.slack.com/archives/C04UQLWQAP3/p1679590747021409 failureMsg := fmt.Sprintf("HTTP probe failed with statuscode: %d", res.StatusCode) - return probe.Failure, failureMsg, nil + return probe.Failure, failureMsg, body, nil } // RedirectChecker returns a function that can be used to check HTTP redirects. diff --git a/pkg/probe/http/patch_http.go b/pkg/probe/http/patch_http.go new file mode 100644 index 0000000000000..71648a79f64b1 --- /dev/null +++ b/pkg/probe/http/patch_http.go @@ -0,0 +1,25 @@ +package http + +import ( + "net/http" + "time" + + "k8s.io/kubernetes/pkg/probe" +) + +// Prober is an interface that defines the Probe function for doing HTTP readiness/liveness checks. +type DetailedProber interface { + ProbeForBody(req *http.Request, timeout time.Duration) (probe.Result, string, string, error) +} + +// ProbeForBody returns a ProbeRunner capable of running an HTTP check. +// returns result, details, body, error +func (pr httpProber) ProbeForBody(req *http.Request, timeout time.Duration) (probe.Result, string, string, error) { + pr.transport.DisableCompression = true // removes Accept-Encoding header + client := &http.Client{ + Timeout: timeout, + Transport: pr.transport, + CheckRedirect: RedirectChecker(pr.followNonLocalRedirects), + } + return DoHTTPProbe(req, client) +}