diff --git a/docs/CONFIG-PROPERTIES.md b/docs/CONFIG-PROPERTIES.md index b7122386ec..ab67b33b33 100644 --- a/docs/CONFIG-PROPERTIES.md +++ b/docs/CONFIG-PROPERTIES.md @@ -64,6 +64,11 @@ | network.switch.enable.arpsnoop | boolean | true | enable ARP Snooping on switch Network Instances | | wwan.query.visible.providers | bool | false | enable to periodically (once per hour) query the set of visible cellular service providers and publish them under WirelessStatus (for every modem) | | network.local.legacy.mac.address | bool | false | enables legacy MAC address generation for local network instances for those EVE nodes where changing MAC addresses in applications will lead to incorrect network configuration | +| goroutine.leak.detection.threshold | integer | 5000 | Amount of goroutines, reaching which will trigger leak detection regardless of growth rate. | +| goroutine.leak.detection.check.interval.minutes | integer (minutes) | 1 | Interval in minutes between the measurements of the goroutine count. | +| goroutine.leak.detection.check.window.minutes | integer (minutes) | 10 | Interval in minutes for which the leak analysis is performed. It should contain at least 10 measurements, so no less than 10 × goroutine.leak.detection.check.interval.minutes. | +| goroutine.leak.detection.keep.stats.hours | integer (hours) | 24 | Amount of hours to keep the stats for leak detection. We keep more stats than the check window to be able to react to settings with a bigger check window via configuration. | +| goroutine.leak.detection.cooldown.minutes | integer (minutes) | 5 | Cooldown period in minutes after the leak detection is triggered. During this period, no stack traces are collected; only warning messages are logged. | In addition, there can be per-agent settings. 
The Per-agent settings begin with "agent.*agentname*.*setting*" diff --git a/pkg/pillar/cmd/watcher/watcher.go b/pkg/pillar/cmd/watcher/watcher.go index e9db68b136..be3f2abbf4 100644 --- a/pkg/pillar/cmd/watcher/watcher.go +++ b/pkg/pillar/cmd/watcher/watcher.go @@ -28,6 +28,73 @@ const ( usageThreshold = 2 ) +// GoroutineLeakDetectionParams holds the global goroutine leak detection parameters +type GoroutineLeakDetectionParams struct { + mutex sync.Mutex + threshold int + checkInterval time.Duration + checkStatsFor time.Duration + keepStatsFor time.Duration + cooldownPeriod time.Duration +} + +func validateGoroutineLeakDetectionParams(threshold int, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration) bool { + if threshold < 1 { + log.Warnf("Invalid threshold: %d", threshold) + return false + } + if checkInterval < 0 { + log.Warnf("Invalid check interval: %v", checkInterval) + return false + } + if checkStatsFor < checkInterval*10 { + log.Warnf("Invalid check window: %v", checkStatsFor) + log.Warnf("Check window must be at least 10 times the check interval (%v)", checkInterval) + return false + } + if keepStatsFor < checkStatsFor { + log.Warnf("Invalid keep stats duration: %v", keepStatsFor) + log.Warnf("Keep stats duration must be greater than a check window (%v)", checkStatsFor) + return false + } + if cooldownPeriod < checkInterval { + log.Warnf("Invalid cooldown period: %v", cooldownPeriod) + log.Warnf("Cooldown period must be greater than a check interval (%v)", checkInterval) + return false + } + return true +} + +// Set atomically sets the global goroutine leak detection parameters +func (gldp *GoroutineLeakDetectionParams) Set(threshold int, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration) { + if !validateGoroutineLeakDetectionParams(threshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) { + return + } + gldp.mutex.Lock() + gldp.threshold = threshold + gldp.checkInterval = checkInterval + 
gldp.checkStatsFor = checkStatsFor + gldp.keepStatsFor = keepStatsFor + gldp.cooldownPeriod = cooldownPeriod + gldp.mutex.Unlock() +} + +// Get atomically gets the global goroutine leak detection parameters +func (gldp *GoroutineLeakDetectionParams) Get() (int, time.Duration, time.Duration, time.Duration, time.Duration) { + var threshold int + var checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration + + gldp.mutex.Lock() + threshold = gldp.threshold + checkInterval = gldp.checkInterval + checkStatsFor = gldp.checkStatsFor + keepStatsFor = gldp.keepStatsFor + cooldownPeriod = gldp.cooldownPeriod + gldp.mutex.Unlock() + + return threshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod +} + type watcherContext struct { agentbase.AgentBase ps *pubsub.PubSub @@ -40,6 +107,9 @@ type watcherContext struct { GCInitialized bool // cli options + + // Global goroutine leak detection parameters + GRLDParams GoroutineLeakDetectionParams } // AddAgentSpecificCLIFlags adds CLI options @@ -84,6 +154,22 @@ func setForcedGOGCParams(ctx *watcherContext) { gogcForcedLock.Unlock() } +// Read the global goroutine leak detection parameters to the context +func readGlobalGoroutineLeakDetectionParams(ctx *watcherContext) { + gcp := agentlog.GetGlobalConfig(log, ctx.subGlobalConfig) + if gcp == nil { + return + } + + threshold := int(gcp.GlobalValueInt(types.GoroutineLeakDetectionThreshold)) + checkInterval := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionCheckIntervalMinutes)) * time.Minute + checkStatsFor := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionCheckWindowMinutes)) * time.Minute + keepStatsFor := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionKeepStatsHours)) * time.Hour + cooldownPeriod := time.Duration(gcp.GlobalValueInt(types.GoroutineLeakDetectionCooldownMinutes)) * time.Minute + + ctx.GRLDParams.Set(threshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) +} + // Listens to root cgroup 
in hierarchy mode (events always propagate // up to the root) and call Go garbage collector with reasonable // interval when certain amount of memory has been allocated (presumably @@ -249,12 +335,44 @@ func handlePotentialGoroutineLeak() { agentlog.DumpAllStacks(log, agentName) } -func goroutinesMonitor(goroutinesThreshold int, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod time.Duration) { +// goroutinesMonitor monitors the number of goroutines and detects potential goroutine leaks. +func goroutinesMonitor(ctx *watcherContext) { + // Get the initial goroutine leak detection parameters to create the stats slice + goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod := ctx.GRLDParams.Get() entriesToKeep := int(keepStatsFor / checkInterval) - entriesToCheck := int(checkStatsFor / checkInterval) stats := make([]int, 0, entriesToKeep+1) var lastLeakHandled time.Time for { + goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod = ctx.GRLDParams.Get() + // Check if we have to resize the stats slice + newEntriesToKeep := int(keepStatsFor / checkInterval) + if newEntriesToKeep != entriesToKeep { + if newEntriesToKeep > entriesToKeep { + // **Increasing Size** + if cap(stats) >= newEntriesToKeep+1 { + // Capacity is sufficient; update entriesToKeep + log.Functionf("Capacity sufficient; update entriesToKeep: %d", newEntriesToKeep) + entriesToKeep = newEntriesToKeep + } else { + // Capacity insufficient; create a new slice + log.Functionf("Capacity insufficient; create a new slice: %d", newEntriesToKeep+1) + newStats := make([]int, len(stats), newEntriesToKeep+1) + copy(newStats, stats) + stats = newStats + entriesToKeep = newEntriesToKeep + } + } else { + // **Decreasing Size** + if len(stats) > newEntriesToKeep { + // Remove oldest entries + log.Functionf("Remove oldest entries: %d", len(stats)-newEntriesToKeep) + stats = stats[len(stats)-newEntriesToKeep:] + } + log.Functionf("Update entriesToKeep: %d", 
newEntriesToKeep) + entriesToKeep = newEntriesToKeep + } + } + entriesToCheck := int(checkStatsFor / checkInterval) time.Sleep(checkInterval) numGoroutines := runtime.NumGoroutine() // First check for the threshold @@ -411,21 +529,7 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar // Handle memory pressure events by calling GC explicitly go handleMemoryPressureEvents() - // Detect goroutine leaks - const ( - // Threshold for the number of goroutines - // When reached, we assume there's a potential goroutine leak - goroutinesThreshold = 1000 - // Check interval for the number of goroutines - checkInterval = 1 * time.Minute - // Check window. On each check, we analyze the stats for that period - checkWindow = 1 * time.Hour - // Keep statistic for that long - keepStatsFor = 24 * time.Hour - // Cooldown period for the leak detection - cooldownPeriod = 10 * time.Minute - ) - go goroutinesMonitor(goroutinesThreshold, checkInterval, checkWindow, keepStatsFor, cooldownPeriod) + go goroutinesMonitor(&ctx) for { select { @@ -646,6 +750,8 @@ func handleGlobalConfigImpl(ctxArg interface{}, key string, if gcp != nil { ctx.GCInitialized = true } + // Update the global goroutine leak detection parameters + readGlobalGoroutineLeakDetectionParams(ctx) log.Functionf("handleGlobalConfigImpl done for %s", key) } diff --git a/pkg/pillar/cmd/watcher/watcher_test.go b/pkg/pillar/cmd/watcher/watcher_test.go index 352d6c7dbb..0799e07a8f 100644 --- a/pkg/pillar/cmd/watcher/watcher_test.go +++ b/pkg/pillar/cmd/watcher/watcher_test.go @@ -244,9 +244,11 @@ func TestGoroutinesMonitorNoLeak(t *testing.T) { r, w, _ := os.Pipe() logger.Out = w - go func() { - goroutinesMonitor(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) - }() + // Create context with default parameters + ctx := &watcherContext{} + ctx.GRLDParams.Set(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) + + go goroutinesMonitor(ctx) 
var wg sync.WaitGroup @@ -298,9 +300,11 @@ func TestGoroutinesMonitorLeak(t *testing.T) { r, w, _ := os.Pipe() logger.Out = w - go func() { - goroutinesMonitor(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) - }() + // Create context with default parameters + ctx := &watcherContext{} + ctx.GRLDParams.Set(goroutinesThreshold, checkInterval, checkStatsFor, keepStatsFor, cooldownPeriod) + + go goroutinesMonitor(ctx) var wg sync.WaitGroup diff --git a/pkg/pillar/docs/watcher.md b/pkg/pillar/docs/watcher.md index f1d3ea89fc..86c2f79891 100644 --- a/pkg/pillar/docs/watcher.md +++ b/pkg/pillar/docs/watcher.md @@ -85,3 +85,8 @@ To prevent repeated handling of the same issue within a short time frame, we incorporate a cooldown period in the `goroutinesMonitor` function. This ensures that resources are not wasted on redundant operations and that the monitoring system remains efficient. + +The goroutine leak detector is dynamically configurable via global configuration +parameters. They are documented in the +[CONFIG-PROPERTIES.md](../../../docs/CONFIG-PROPERTIES.md) and all have +the `goroutine.leak.detection` prefix. diff --git a/pkg/pillar/types/global.go b/pkg/pillar/types/global.go index 9b33134f48..533e208dc2 100644 --- a/pkg/pillar/types/global.go +++ b/pkg/pillar/types/global.go @@ -257,6 +257,22 @@ const ( // WwanQueryVisibleProviders : periodically query visible cellular service providers WwanQueryVisibleProviders GlobalSettingKey = "wwan.query.visible.providers" + // GoroutineLeakDetectionThreshold amount of goroutines, reaching which will trigger leak detection + // regardless of growth rate. + GoroutineLeakDetectionThreshold GlobalSettingKey = "goroutine.leak.detection.threshold" + // GoroutineLeakDetectionCheckIntervalMinutes interval in minutes between the measurements of the + // goroutine count.
+	GoroutineLeakDetectionCheckIntervalMinutes GlobalSettingKey = "goroutine.leak.detection.check.interval.minutes" +	// GoroutineLeakDetectionCheckWindowMinutes interval in minutes for which the leak analysis is performed. +	// It should contain at least 10 measurements, so no less than 10 * GoroutineLeakDetectionCheckIntervalMinutes. +	GoroutineLeakDetectionCheckWindowMinutes GlobalSettingKey = "goroutine.leak.detection.check.window.minutes" +	// GoroutineLeakDetectionKeepStatsHours amount of hours to keep the stats for the leak detection. We keep more +	// stats than the check window to be able to react to settings with a bigger check window via configuration. +	GoroutineLeakDetectionKeepStatsHours GlobalSettingKey = "goroutine.leak.detection.keep.stats.hours" +	// GoroutineLeakDetectionCooldownMinutes cooldown period in minutes after the leak detection is triggered. During +	// this period no stack traces are collected, only warning messages are logged. +	GoroutineLeakDetectionCooldownMinutes GlobalSettingKey = "goroutine.leak.detection.cooldown.minutes" +	// TriState Items // NetworkFallbackAnyEth global setting key NetworkFallbackAnyEth GlobalSettingKey = "network.fallback.any.eth" @@ -933,6 +949,13 @@ func NewConfigItemSpecMap() ConfigItemSpecMap { configItemSpecMap.AddIntItem(LogRemainToSendMBytes, 2048, 10, 0xFFFFFFFF) configItemSpecMap.AddIntItem(DownloadMaxPortCost, 0, 0, 255) +	// Goroutine Leak Detection section +	configItemSpecMap.AddIntItem(GoroutineLeakDetectionThreshold, 5000, 1, 0xFFFFFFFF) +	configItemSpecMap.AddIntItem(GoroutineLeakDetectionCheckIntervalMinutes, 1, 1, 0xFFFFFFFF) +	configItemSpecMap.AddIntItem(GoroutineLeakDetectionCheckWindowMinutes, 10, 10, 0xFFFFFFFF) +	configItemSpecMap.AddIntItem(GoroutineLeakDetectionKeepStatsHours, 24, 1, 0xFFFFFFFF) +	configItemSpecMap.AddIntItem(GoroutineLeakDetectionCooldownMinutes, 5, 1, 0xFFFFFFFF) + // Add Bool Items configItemSpecMap.AddBoolItem(UsbAccess, true) // Controller likely default to false
configItemSpecMap.AddBoolItem(VgaAccess, true) // Controller likely default to false diff --git a/pkg/pillar/types/global_test.go b/pkg/pillar/types/global_test.go index e7325cd898..444b41917c 100644 --- a/pkg/pillar/types/global_test.go +++ b/pkg/pillar/types/global_test.go @@ -215,6 +215,11 @@ func TestNewConfigItemSpecMap(t *testing.T) { NetDumpTopicPostOnboardInterval, NetDumpDownloaderPCAP, NetDumpDownloaderHTTPWithFieldValue, + GoroutineLeakDetectionThreshold, + GoroutineLeakDetectionCheckIntervalMinutes, + GoroutineLeakDetectionCheckWindowMinutes, + GoroutineLeakDetectionKeepStatsHours, + GoroutineLeakDetectionCooldownMinutes, } if len(specMap.GlobalSettings) != len(gsKeys) { t.Errorf("GlobalSettings has more (%d) than expected keys (%d)",