From cd7fafc69e8dd807ddc7a4c74e094c6cb75a8dee Mon Sep 17 00:00:00 2001 From: Nikolay Martyanov Date: Thu, 7 Nov 2024 19:52:27 +0100 Subject: [PATCH] pillar/watcher: Make goroutine leak detector configurable. Added new configuration options for the goroutine leak detector, enabling adjustment of parameters such as threshold, check interval, analysis window, stats retention, and cooldown period through global settings. This enhancement allows for dynamic tuning of leak detection based on deployment needs without code modification. Introduced `GoroutineLeakDetectionParams` struct to manage these settings, along with validation to ensure parameters meet minimum requirements. Updated the `watcher` component to use this configurable setup, and integrated new global config keys to hold these values. Signed-off-by: Nikolay Martyanov --- docs/CONFIG-PROPERTIES.md | 5 + pkg/pillar/cmd/watcher/watcher.go | 140 ++++++++++++++++++++++--- pkg/pillar/cmd/watcher/watcher_test.go | 16 +-- pkg/pillar/docs/watcher.md | 5 + pkg/pillar/types/global.go | 23 ++++ pkg/pillar/types/global_test.go | 5 + 6 files changed, 171 insertions(+), 23 deletions(-) diff --git a/docs/CONFIG-PROPERTIES.md b/docs/CONFIG-PROPERTIES.md index b7122386ec..ab67b33b33 100644 --- a/docs/CONFIG-PROPERTIES.md +++ b/docs/CONFIG-PROPERTIES.md @@ -64,6 +64,11 @@ | network.switch.enable.arpsnoop | boolean | true | enable ARP Snooping on switch Network Instances | | wwan.query.visible.providers | bool | false | enable to periodically (once per hour) query the set of visible cellular service providers and publish them under WirelessStatus (for every modem) | | network.local.legacy.mac.address | bool | false | enables legacy MAC address generation for local network instances for those EVE nodes where changing MAC addresses in applications will lead to incorrect network configuration | +| goroutine.leak.detection.threshold | integer | 5000 | Amount of goroutines, reaching which will trigger leak detection regardless of 
growth rate. | +| goroutine.leak.detection.check.interval.minutes | integer (minutes) | 1 | Interval in minutes between the measurements of the goroutine count. | +| goroutine.leak.detection.check.window.minutes | integer (minutes) | 10 | Interval in minutes for which the leak analysis is performed. It should contain at least 10 measurements, so no less than 10 × goroutine.leak.detection.check.interval.minutes. | +| goroutine.leak.detection.keep.stats.hours | integer (hours) | 24 | Amount of hours to keep the stats for leak detection. We keep more stats than the check window to be able to react to settings with a bigger check window via configuration. | +| goroutine.leak.detection.cooldown.minutes | integer (minutes) | 5 | Cooldown period in minutes after the leak detection is triggered. During this period, no stack traces are collected; only warning messages are logged. | In addition, there can be per-agent settings. The Per-agent settings begin with "agent.*agentname*.*setting*" diff --git a/pkg/pillar/cmd/watcher/watcher.go b/pkg/pillar/cmd/watcher/watcher.go index e9db68b136..be3f2abbf4 100644 --- a/pkg/pillar/cmd/watcher/watcher.go +++ b/pkg/pillar/cmd/watcher/watcher.go @@ -28,6 +28,73 @@ const ( usageThreshold = 2 ) +// GoroutineLeakDetectionParams holds the global goroutine leak detection parameters +type GoroutineLeakDetectionParams struct { + mutex sync.Mutex + threshold int + checkInterval time.Duration + checkStatsFor time.Duration + keepStatsFor time.Duration + cooldownPeriod time.Duration +} + +func validateGoroutineLeakDetectionParams(threshold int, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration) bool { + if threshold < 1 { + log.Warnf("Invalid threshold: %d", threshold) + return false + } + if checkInterval < 0 { + log.Warnf("Invalid check interval: %v", checkInterval) + return false + } + if checkStatsFor < checkInterval*10 { + log.Warnf("Invalid check window: %v", checkStatsFor) + log.Warnf("Check window must be at least 
10 times the check interval (%v)", checkInterval) + return false + } + if keepStatsFor < checkStatsFor { + log.Warnf("Invalid keep stats duration: %v", keepStatsFor) + log.Warnf("Keep stats duration must be greater than a check window (%v)", checkStatsFor) + return false + } + if cooldownPeriod < checkInterval { + log.Warnf("Invalid cooldown period: %v", cooldownPeriod) + log.Warnf("Cooldown period must be greater than a check interval (%v)", checkInterval) + return false + } + return true +} + +// Set atomically sets the global goroutine leak detection parameters +func (gldp *GoroutineLeakDetectionParams) Set(threshold int, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration) { + if !validateGoroutineLeakDetectionParams(threshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) { + return + } + gldp.mutex.Lock() + gldp.threshold = threshold + gldp.checkInterval = checkInterval + gldp.checkStatsFor = checkStatsFor + gldp.keepStatsFor = keepStatsFor + gldp.cooldownPeriod = cooldownPeriod + gldp.mutex.Unlock() +} + +// Get atomically gets the global goroutine leak detection parameters +func (gldp *GoroutineLeakDetectionParams) Get() (int, time.Duration, time.Duration, time.Duration, time.Duration) { + var threshold int + var checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration + + gldp.mutex.Lock() + threshold = gldp.threshold + checkInterval = gldp.checkInterval + checkStatsFor = gldp.checkStatsFor + keepStatsFor = gldp.keepStatsFor + cooldownPeriod = gldp.cooldownPeriod + gldp.mutex.Unlock() + + return threshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod +} + type watcherContext struct { agentbase.AgentBase ps *pubsub.PubSub @@ -40,6 +107,9 @@ type watcherContext struct { GCInitialized bool // cli options + + // Global goroutine leak detection parameters + GRLDParams GoroutineLeakDetectionParams } // AddAgentSpecificCLIFlags adds CLI options @@ -84,6 +154,22 @@ func setForcedGOGCParams(ctx 
*watcherContext) { gogcForcedLock.Unlock() } +// Read the global goroutine leak detection parameters to the context +func readGlobalGoroutineLeakDetectionParams(ctx *watcherContext) { + gcp := agentlog.GetGlobalConfig(log, ctx.subGlobalConfig) + if gcp == nil { + return + } + + threshold := int(gcp.GlobalValueInt(types.GoroutineLeakDetectionThreshold)) + checkInterval := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionCheckIntervalMinutes)) * time.Minute + checkStatsFor := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionCheckWindowMinutes)) * time.Minute + keepStatsFor := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionKeepStatsHours)) * time.Hour + cooldownPeriod := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionCooldownMinutes)) * time.Minute + + ctx.GRLDParams.Set(threshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) +} + // Listens to root cgroup in hierarchy mode (events always propagate // up to the root) and call Go garbage collector with reasonable // interval when certain amount of memory has been allocated (presumably @@ -249,12 +335,44 @@ func handlePotentialGoroutineLeak() { agentlog.DumpAllStacks(log, agentName) } -func goroutinesMonitor(goroutinesThreshold int, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration) { +// goroutinesMonitor monitors the number of goroutines and detects potential goroutine leaks. 
+func goroutinesMonitor(ctx *watcherContext) { + // Get the initial goroutine leak detection parameters to create the stats slice + goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod := ctx.GRLDParams.Get() entriesToKeep := int(keepStatsFor / checkInterval) - entriesToCheck := int(checkStatsFor / checkInterval) stats := make([]int, 0, entriesToKeep+1) var lastLeakHandled time.Time for { + goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod = ctx.GRLDParams.Get() + // Check if we have to resize the stats slice + newEntriesToKeep := int(keepStatsFor / checkInterval) + if newEntriesToKeep != entriesToKeep { + if newEntriesToKeep > entriesToKeep { + // **Increasing Size** + if cap(stats) >= newEntriesToKeep+1 { + // Capacity is sufficient; update entriesToKeep + log.Functionf("Capacity sufficient; update entriesToKeep: %d", newEntriesToKeep) + entriesToKeep = newEntriesToKeep + } else { + // Capacity insufficient; create a new slice + log.Functionf("Capacity insufficient; create a new slice: %d", newEntriesToKeep+1) + newStats := make([]int, len(stats), newEntriesToKeep+1) + copy(newStats, stats) + stats = newStats + entriesToKeep = newEntriesToKeep + } + } else { + // **Decreasing Size** + if len(stats) > newEntriesToKeep { + // Remove oldest entries + log.Functionf("Remove oldest entries: %d", len(stats)-newEntriesToKeep) + stats = stats[len(stats)-newEntriesToKeep:] + } + log.Functionf("Update entriesToKeep: %d", newEntriesToKeep) + entriesToKeep = newEntriesToKeep + } + } + entriesToCheck := int(checkStatsFor / checkInterval) time.Sleep(checkInterval) numGoroutines := runtime.NumGoroutine() // First check for the threshold @@ -411,21 +529,7 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar // Handle memory pressure events by calling GC explicitly go handleMemoryPressureEvents() - // Detect goroutine leaks - const ( - // Threshold for the number of goroutines - // When 
reached, we assume there's a potential goroutine leak - goroutinesThreshold = 1000 - // Check interval for the number of goroutines - checkInterval = 1 * time.Minute - // Check window. On each check, we analyze the stats for that period - checkWindow = 1 * time.Hour - // Keep statistic for that long - keepStatsFor = 24 * time.Hour - // Cooldown period for the leak detection - cooldownPeriod = 10 * time.Minute - ) - go goroutinesMonitor(goroutinesThreshold, checkInterval, checkWindow, keepStatsFor, cooldownPeriod) + go goroutinesMonitor(&ctx) for { select { @@ -646,6 +750,8 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string, if gcp != nil { ctx.GCInitialized = true } + // Update the global goroutine leak detection parameters + readGlobalGoroutineLeakDetectionParams(ctx) log.Functionf("handleGlobalConfigImpl done for %s", key) } diff --git a/pkg/pillar/cmd/watcher/watcher_test.go b/pkg/pillar/cmd/watcher/watcher_test.go index 352d6c7dbb..0799e07a8f 100644 --- a/pkg/pillar/cmd/watcher/watcher_test.go +++ b/pkg/pillar/cmd/watcher/watcher_test.go @@ -244,9 +244,11 @@ func TestGoroutinesMonitorNoLeak(t *testing.T) { r, w, _ := os.Pipe() logger.Out = w - go func() { - goroutinesMonitor(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) - }() + // Create context with default parameters + ctx := &watcherContext{} + ctx.GRLDParams.Set(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) + + go goroutinesMonitor(ctx) var wg sync.WaitGroup @@ -298,9 +300,11 @@ func TestGoroutinesMonitorLeak(t *testing.T) { r, w, _ := os.Pipe() logger.Out = w - go func() { - goroutinesMonitor(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) - }() + // Create context with default parameters + ctx := &watcherContext{} + ctx.GRLDParams.Set(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) + + go goroutinesMonitor(ctx) var wg sync.WaitGroup diff --git 
a/pkg/pillar/docs/watcher.md b/pkg/pillar/docs/watcher.md index f1d3ea89fc..86c2f79891 100644 --- a/pkg/pillar/docs/watcher.md +++ b/pkg/pillar/docs/watcher.md @@ -85,3 +85,8 @@ To prevent repeated handling of the same issue within a short time frame, we incorporate a cooldown period in the `goroutinesMonitor` function. This ensures that resources are not wasted on redundant operations and that the monitoring system remains efficient. + +The goroutine leak detector is dynamically configurable via global configuration +parameters. They are documented in the +[CONFIG-PROPERTIES.md](../../../docs/CONFIG-PROPERTIES.md) and all have the +`goroutine.leak.detection` prefix. diff --git a/pkg/pillar/types/global.go b/pkg/pillar/types/global.go index 9b33134f48..533e208dc2 100644 --- a/pkg/pillar/types/global.go +++ b/pkg/pillar/types/global.go @@ -257,6 +257,22 @@ const ( // WwanQueryVisibleProviders : periodically query visible cellular service providers WwanQueryVisibleProviders GlobalSettingKey = "wwan.query.visible.providers" + // GoroutineLeakDetectionThreshold amount of goroutines, reaching which will trigger leak detection + // regardless of growth rate. + GoroutineLeakDetectionThreshold GlobalSettingKey = "goroutine.leak.detection.threshold" + // GoroutineLeakDetectionCheckIntervalMinutes interval in minutes between the measurements of the + // goroutine count. + GoroutineLeakDetectionCheckIntervalMinutes GlobalSettingKey = "goroutine.leak.detection.check.interval.minutes" + // GoroutineLeakDetectionCheckWindowMinutes interval in minutes for which the leak analysis is performed. + // It should contain at least 10 measurements, so no less than 10 * GoroutineLeakDetectionCheckIntervalMinutes. + GoroutineLeakDetectionCheckWindowMinutes GlobalSettingKey = "goroutine.leak.detection.check.window.minutes" + // GoroutineLeakDetectionKeepStatsHours amount of hours to keep the stats for the leak detection. 
We keep more + // stats than the check window to be able to react to settings with a bigger check window via configuration. + GoroutineLeakDetectionKeepStatsHours GlobalSettingKey = "goroutine.leak.detection.keep.stats.hours" + // GoroutineLeakDetectionCooldownMinutes cooldown period in minutes after the leak detection is triggered. During + // this period no stack traces are collected, only warning messages are logged. + GoroutineLeakDetectionCooldownMinutes GlobalSettingKey = "goroutine.leak.detection.cooldown.minutes" + // TriState Items // NetworkFallbackAnyEth global setting key NetworkFallbackAnyEth GlobalSettingKey = "network.fallback.any.eth" @@ -933,6 +949,13 @@ func NewConfigItemSpecMap() ConfigItemSpecMap { configItemSpecMap.AddIntItem(LogRemainToSendMBytes, 2048, 10, 0xFFFFFFFF) configItemSpecMap.AddIntItem(DownloadMaxPortCost, 0, 0, 255) + // Goroutine Leak Detection section + configItemSpecMap.AddIntItem(GoroutineLeakDetectionThreshold, 5000, 1, 0xFFFFFFFF) + configItemSpecMap.AddIntItem(GoroutineLeakDetectionCheckIntervalMinutes, 1, 1, 0xFFFFFFFF) + configItemSpecMap.AddIntItem(GoroutineLeakDetectionCheckWindowMinutes, 10, 10, 0xFFFFFFFF) + configItemSpecMap.AddIntItem(GoroutineLeakDetectionKeepStatsHours, 24, 1, 0xFFFFFFFF) + configItemSpecMap.AddIntItem(GoroutineLeakDetectionCooldownMinutes, 5, 1, 0xFFFFFFFF) + // Add Bool Items configItemSpecMap.AddBoolItem(UsbAccess, true) // Controller likely default to false configItemSpecMap.AddBoolItem(VgaAccess, true) // Controller likely default to false diff --git a/pkg/pillar/types/global_test.go b/pkg/pillar/types/global_test.go index e7325cd898..444b41917c 100644 --- a/pkg/pillar/types/global_test.go +++ b/pkg/pillar/types/global_test.go @@ -215,6 +215,11 @@ func TestNewConfigItemSpecMap(t *testing.T) { NetDumpTopicPostOnboardInterval, NetDumpDownloaderPCAP, NetDumpDownloaderHTTPWithFieldValue, + GoroutineLeakDetectionThreshold, + GoroutineLeakDetectionCheckIntervalMinutes, + 
GoroutineLeakDetectionCheckWindowMinutes, + GoroutineLeakDetectionKeepStatsHours, + GoroutineLeakDetectionCooldownMinutes, } if len(specMap.GlobalSettings) != len(gsKeys) { t.Errorf("GlobalSettings has more (%d) than expected keys (%d)",