feat: make collector health check timeout configurable (#1371)
## Which problem is this PR solving?

When there's a large volume of traffic, the collector can take longer to
respond to the health check.

## Short description of the changes

- add a new config option `HealthCheckTimeout` in `Collection` (see the example below)
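
For illustration, a minimal sketch of how the new option might be set in a Refinery config file. The `Collection` section name follows this PR; the surrounding layout and the 10s value are assumptions:

```yaml
Collection:
  # Maximum time the collector may take to answer its health check.
  # Defaults to 3s; raise it when heavy traffic slows the collect loop.
  HealthCheckTimeout: 10s
```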

---------

Co-authored-by: Tyler Helmuth <12352919+TylerHelmuth@users.noreply.github.com>
VinozzZ and TylerHelmuth authored Oct 10, 2024
1 parent 43ba75a commit 7bdc0a7
Showing 3 changed files with 16 additions and 3 deletions.
8 changes: 5 additions & 3 deletions collect/collect.go
@@ -141,7 +141,7 @@ func (i *InMemCollector) Start() error {
 	// listen for config reloads
 	i.Config.RegisterReloadCallback(i.sendReloadSignal)
 
-	i.Health.Register(CollectorHealthKey, 3*time.Second)
+	i.Health.Register(CollectorHealthKey, time.Duration(imcConfig.HealthCheckTimeout))
 
 	for _, metric := range inMemCollectorMetrics {
 		i.Metrics.Register(metric)
@@ -339,6 +339,8 @@ func (i *InMemCollector) collect() {
 	defer i.mutex.Unlock()
 
 	for {
+		startTime := time.Now()
+
 		i.Health.Ready(CollectorHealthKey, true)
 		// record channel lengths as histogram but also as gauges
 		i.Metrics.Histogram("collector_incoming_queue", float64(len(i.incoming)))
@@ -385,18 +387,18 @@ func (i *InMemCollector) collect() {
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case sp, ok := <-i.fromPeer:
 				if !ok {
 					// channel's been closed; we should shut down.
 					return
 				}
 				i.processSpan(sp)
-				continue
 			case <-i.reload:
 				i.reloadConfigs()
 			}
 		}
 
+		i.Metrics.Gauge("collector_collect_loop_duration_ms", float64(time.Now().Sub(startTime).Milliseconds()))
 	}
 }
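
The hunks above add two things: the health key is now registered with the configurable timeout, and each pass through the collect loop records its duration as a gauge. To make the register/heartbeat pattern concrete, here is a self-contained Go sketch of a health registry with per-key timeouts. It is an illustration only, not Refinery's actual `Health` package; the type and method names below are assumptions modeled on the calls visible in the diff:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// HealthRegistry sketches the pattern the diff relies on: a subsystem
// registers a key with a timeout, then must call Ready within that window
// or it is reported unhealthy.
type HealthRegistry struct {
	mu        sync.Mutex
	timeouts  map[string]time.Duration
	lastReady map[string]time.Time
}

func NewHealthRegistry() *HealthRegistry {
	return &HealthRegistry{
		timeouts:  make(map[string]time.Duration),
		lastReady: make(map[string]time.Time),
	}
}

// Register declares a health key and the maximum interval allowed between
// Ready calls before the key is considered unhealthy.
func (h *HealthRegistry) Register(key string, timeout time.Duration) {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.timeouts[key] = timeout
	h.lastReady[key] = time.Now()
}

// Ready records a heartbeat for the key.
func (h *HealthRegistry) Ready(key string, ready bool) {
	h.mu.Lock()
	defer h.mu.Unlock()
	if ready {
		h.lastReady[key] = time.Now()
	}
}

// Healthy reports whether the key's last heartbeat arrived within its timeout.
func (h *HealthRegistry) Healthy(key string) bool {
	h.mu.Lock()
	defer h.mu.Unlock()
	return time.Since(h.lastReady[key]) <= h.timeouts[key]
}

func main() {
	h := NewHealthRegistry()
	// With the old hardcoded 3s, a collect loop stalled for more than 3s
	// flips to unhealthy; a configurable timeout widens that window.
	h.Register("collector", 5*time.Second)
	h.Ready("collector", true)
	fmt.Println(h.Healthy("collector")) // true
}
```

Under this model, raising `HealthCheckTimeout` widens the window the collect loop has to call `Ready` between iterations, which is what helps under heavy traffic; the new `collector_collect_loop_duration_ms` gauge makes that per-pass cost observable.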

1 change: 1 addition & 0 deletions config/file_config.go
@@ -302,6 +302,7 @@ type CollectionConfig struct {
 	PeerQueueSize         int        `yaml:"PeerQueueSize"`
 	IncomingQueueSize     int        `yaml:"IncomingQueueSize"`
 	AvailableMemory       MemorySize `yaml:"AvailableMemory" cmdenv:"AvailableMemory"`
+	HealthCheckTimeout    Duration   `yaml:"HealthCheckTimeout" default:"3s"`
 	MaxMemoryPercentage   int        `yaml:"MaxMemoryPercentage" default:"75"`
 	MaxAlloc              MemorySize `yaml:"MaxAlloc"`
 	DisableRedistribution bool       `yaml:"DisableRedistribution"`
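
The new struct field uses the config package's `Duration` type, which is what makes the `default:"3s"` tag and the `time.Duration(imcConfig.HealthCheckTimeout)` conversion in collect.go possible. As a hedged sketch (Refinery's actual implementation may differ), such a wrapper is typically a named `time.Duration` with text-based unmarshalling:

```go
package config

import "time"

// Duration is an illustrative YAML-friendly duration wrapper, not
// Refinery's exact code. The conversion time.Duration(d) works because
// the underlying type is time.Duration.
type Duration time.Duration

// UnmarshalText parses values such as "3s" or "500ms".
func (d *Duration) UnmarshalText(text []byte) error {
	parsed, err := time.ParseDuration(string(text))
	if err != nil {
		return err
	}
	*d = Duration(parsed)
	return nil
}

// MarshalText renders the duration back to its string form.
func (d Duration) MarshalText() ([]byte, error) {
	return []byte(time.Duration(d).String()), nil
}
```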
10 changes: 10 additions & 0 deletions config/metadata/configMeta.yaml
@@ -1296,6 +1296,16 @@ groups:
       description: >
         If `true`, Refinery will route all spans that belong to the same trace to a single peer.
+      - name: HealthCheckTimeout
+        type: duration
+        valuetype: nondefault
+        firstversion: v2.8
+        default: 3s
+        reload: false
+        summary: Controls the maximum duration allowed for collection health checks to complete.
+        description: >
+          The `HealthCheckTimeout` setting specifies the maximum duration allowed for the health checks of the collection subsystems to complete. If a subsystem does not respond within this timeout period, it will be marked as unhealthy.
+          This timeout value should be set carefully to ensure that transient delays do not lead to unnecessary failure detection while still allowing for timely identification of actual health issues.
       - name: BufferSizes
         title: "Buffer Sizes"
         description: >
