Skip to content

Commit

Permalink
cr: refactor sidecar prober logic
Browse files Browse the repository at this point in the history
  • Loading branch information
FUSAKLA committed Aug 14, 2019
1 parent 5ff4082 commit 3eaf82a
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 9 deletions.
6 changes: 3 additions & 3 deletions cmd/thanos/compact.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,9 @@ func runCompact(

downsampleMetrics := newDownsampleMetrics(reg)

readinessProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
statusProber := prober.NewProber(component, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
// Initialize the default HTTP listener, which serves the metrics endpoint and the readiness/liveness probes.
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil {
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil {
return errors.Wrap(err, "create readiness prober")
}

Expand Down Expand Up @@ -326,7 +326,7 @@ func runCompact(
})

level.Info(logger).Log("msg", "starting compact node")
readinessProber.SetReady()
statusProber.SetReady()
return nil
}

Expand Down
28 changes: 22 additions & 6 deletions cmd/thanos/sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ func runSidecar(
uploads = false
}

readinessProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
statusProber := prober.NewProber(comp, logger, prometheus.WrapRegistererWithPrefix("thanos_", reg))
// Initialize the default HTTP listener, which serves the metrics endpoint and the readiness/liveness probes.
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, readinessProber); err != nil {
if err := defaultHTTPListener(g, logger, reg, httpBindAddr, statusProber); err != nil {
return errors.Wrap(err, "create readiness prober")
}

Expand All @@ -148,6 +148,12 @@ func runSidecar(
}
}

// When the heartbeat to Prometheus fails, the sidecar is marked as not ready.
// After `heartbeatFailLimit` consecutive failures it is also marked as not healthy,
// so that the orchestrator (if any) can try restarting it in case that helps.
heartbeatFailCount := 0
heartbeatFailLimit := 6

// Blocking query of external labels before joining as a Source Peer into gossip.
// We retry infinitely until we reach and fetch labels from our Prometheus.
err := runutil.Retry(2*time.Second, ctx.Done(), func() error {
Expand All @@ -157,7 +163,11 @@ func runSidecar(
"err", err,
)
promUp.Set(0)
readinessProber.SetNotReady(err)
statusProber.SetNotReady(err)
if heartbeatFailCount >= heartbeatFailLimit {
statusProber.SetNotHealthy(err)
}
heartbeatFailCount++
return err
}

Expand All @@ -166,7 +176,8 @@ func runSidecar(
"external_labels", m.Labels().String(),
)
promUp.Set(1)
readinessProber.SetReady()
statusProber.SetReady()
heartbeatFailCount = 0
lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9)
return nil
})
Expand All @@ -187,10 +198,15 @@ func runSidecar(
if err := m.UpdateLabels(iterCtx, logger); err != nil {
level.Warn(logger).Log("msg", "heartbeat failed", "err", err)
promUp.Set(0)
readinessProber.SetNotReady(err)
statusProber.SetNotReady(err)
if heartbeatFailCount >= heartbeatFailLimit {
statusProber.SetNotHealthy(err)
}
heartbeatFailCount++
} else {
promUp.Set(1)
readinessProber.SetReady()
statusProber.SetReady()
heartbeatFailCount = 0
lastHeartbeat.Set(float64(time.Now().UnixNano()) / 1e9)
}

Expand Down
13 changes: 13 additions & 0 deletions pkg/prober/prober.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,19 @@ type Prober struct {
func NewProber(component component.Component, logger log.Logger, reg prometheus.Registerer) *Prober {
initialErr := fmt.Errorf(initialErrorFmt, component)

// From Kubernetes documentation https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/ :
//
// liveness: Many applications running for long periods of time eventually transition to broken states,
// (healthy) and cannot recover except by being restarted.
// Kubernetes provides liveness probes to detect and remedy such situations.
//
// readiness: Sometimes, applications are temporarily unable to serve traffic.
// (ready) For example, an application might need to load large data or configuration files during startup,
// or depend on external services after startup. In such cases, you don’t want to kill the application,
// but you don’t want to send it requests either. Kubernetes provides readiness probes to detect
// and mitigate these situations. A pod with containers reporting that they are not ready
// does not receive traffic through Kubernetes Services.

p := &Prober{
component: component,
logger: logger,
Expand Down

0 comments on commit 3eaf82a

Please sign in to comment.