feat: make collector health check timeout configurable (#1371)
## Which problem is this PR solving?

When there's a large volume of traffic, the collector can take longer to
respond to the health check.

## Short description of the changes

- add a new config option `HealthCheckTimeout` in `Collection` (see the example below)
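
For illustration, a minimal sketch of how the new option might be set in a Refinery config file. The `Collection` section name follows this PR; the surrounding layout and the 10s value are assumptions:

```yaml
Collection:
  # Maximum time the collector may take to answer its health check.
  # Defaults to 3s; raise it when heavy traffic slows the collect loop.
  HealthCheckTimeout: 10s
```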

---------

Co-authored-by: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com>
VinozzZ and TylerHelmuth authored Oct 10, 2024
1 parent 43ba75a commit 7bdc0a7
Showing 3 changed files with 16 additions and 3 deletions.
8 changes: 5 additions & 3 deletions collect/collect.go
@@ -141,7 +141,7 @@ func (i *InMemCollector) Start() error {
 	// listen for config reloads
 	i.Config.RegisterReloadCallback(i.sendReloadSignal)
 
-	i.Health.Register(CollectorHealthKey, 3*time.Second)
+	i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout))
 
 	for _, metric := range inMemCollectorMetrics {
 		i.Metrics.Register(metric)
@@ -339,6 +339,8 @@ func (i *InMemCollector) collect() {
 	defer i.mutex.Unlock()
 
 	for {
+		startTime := time.Now()
+
 		i.Health.Ready(CollectorHealthKey, true)
 		// record channel lengths as histogram but also as gauges
 		i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming)))
@@ -385,18 +387,18 @@ func (i *InMemCollector) collect() {
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case sp, ok := <-i.fromPeer:
 				if !ok {
 					// channel's been closed; we should shut down.
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case <-i.reload:
 				i.reloadConfigs()
 			}
 		}
 
+		i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
 	}
 }
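
The hunks above add two things: the health key is now registered with the configurable timeout, and each pass through the collect loop records its duration as a gauge. To make the register/heartbeat pattern concrete, here is a self-contained Go sketch of a health registry with per-key timeouts. It is an illustration only, not Refinery's actual `Health` package; the type and method names below are assumptions modeled on the calls visible in the diff:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// HealthRegistry sketches the pattern the diff relies on: a subsystem
// registers a key with a timeout, then must call Ready within that window
// or it is reported unhealthy.
type HealthRegistry struct {
	mu        sync.Mutex
	timeouts  map[string]time.Duration
	lastReady map[string]time.Time
}

func NewHealthRegistry() *HealthRegistry {
	return &HealthRegistry{
		timeouts:  make(map[string]time.Duration),
		lastReady: make(map[string]time.Time),
	}
}

// Register declares a health key and the maximum interval allowed between
// Ready calls before the key is considered unhealthy.
func (h *HealthRegistry) Register(key string, timeout time.Duration) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.timeouts[key] = timeout
	h.lastReady[key] = time.Now()
}

// Ready records a heartbeat for the key.
func (h *HealthRegistry) Ready(key string, ready bool) {
	h.mu.Lock()
	defer h.mu.Unlock()
	if ready {
		h.lastReady[key] = time.Now()
	}
}

// Healthy reports whether the key's last heartbeat arrived within its timeout.
func (h *HealthRegistry) Healthy(key string) bool {
	h.mu.Lock()
	defer h.mu.Unlock()
	return time.Since(h.lastReady[key]) <= h.timeouts[key]
}

func main() {
	h := NewHealthRegistry()
	// With the old hardcoded 3s, a collect loop stalled for more than 3s
	// flips to unhealthy; a configurable timeout widens that window.
	h.Register("collector", 5*time.Second)
	h.Ready("collector", true)
	fmt.Println(h.Healthy("collector")) // true
}
```

Under this model, raising `HealthCheckTimeout` widens the window the collect loop has to call `Ready` between iterations, which is what helps under heavy traffic; the new `collector_collect_loop_duration_ms` gauge makes that per-pass cost observable.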

1 change: 1 addition & 0 deletions config/file_config.go
@@ -302,6 +302,7 @@ type CollectionConfig struct {
 	PeerQueueSize         int        `yaml:"PeerQueueSize"`
 	IncomingQueueSize     int        `yaml:"IncomingQueueSize"`
 	AvailableMemory       MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"`
+	HealthCheckTimeout    Duration   `yaml:"HealthCheckTimeout" default:"3s"`
 	MaxMemoryPercentage   int        `yaml:"MaxMemoryPercentage" default:"75"`
 	MaxAlloc              MemorySize `yaml:"MaxAlloc"`
 	DisableRedistribution bool       `yaml:"DisableRedistribution"`
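
The new struct field uses the config package's `Duration` type, which is what makes the `default:"3s"` tag and the `time.Duration(imcConfig.HealthCheckTimeout)` conversion in collect.go possible. As a hedged sketch (Refinery's actual implementation may differ), such a wrapper is typically a named `time.Duration` with text-based unmarshalling:

```go
package config

import "time"

// Duration is an illustrative YAML-friendly duration wrapper, not
// Refinery's exact code. The conversion time.Duration(d) works because
// the underlying type is time.Duration.
type Duration time.Duration

// UnmarshalText parses values such as "3s" or "500ms".
func (d *Duration) UnmarshalText(text []byte) error {
	parsed, err := time.ParseDuration(string(text))
	if err != nil {
		return err
	}
	*d = Duration(parsed)
	return nil
}

// MarshalText renders the duration back to its string form.
func (d Duration) MarshalText() ([]byte, error) {
	return []byte(time.Duration(d).String()), nil
}
```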
10 changes: 10 additions & 0 deletions config/metadata/configMeta.yaml
@@ -1296,6 +1296,16 @@ groups:
       description: >
         If `true`, Refinery will route all spans that belong to the same trace to a single peer.
+      - name: HealthCheckTimeout
+        type: duration
+        valuetype: nondefault
+        firstversion: v2.8
+        default: 3s
+        reload: false
+        summary: Controls the maximum duration allowed for collection health checks to complete.
+        description: >
+          The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy.
+          This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
       - name: BufferSizes
         title: "Buffer Sizes"
         description: >
