From 9a7dbc8e867321a00f2b9d71670f9110ee59134c Mon Sep 17 00:00:00 2001 From: Danielle Lancashire Date: Thu, 19 Sep 2019 00:57:23 +0200 Subject: [PATCH] client: Return empty values when host stats fail Currently, there is an issue when running on Windows whereby under some circumstances the Windows stats API's will begin to return errors (such as internal timeouts) when a client is under high load, and potentially other forms of resource contention / system states (and other unknown cases). When an error occurs during this collection, we then short circuit further metrics emission from the client until the next interval. This can be problematic if it happens for a sustained number of intervals, as our metrics aggregator will begin to age out older metrics, and we will eventually stop emitting various types of metrics including `nomad.client.unallocated.*` metrics. However, when metrics collection fails on Linux, gopsutil will in many cases (e.g cpu.Times) silently return 0 values, rather than an error. Here, we switch to returning empty metrics in these failures, and logging the error at the source. This brings the behaviour into line with Linux/Unix platforms, and although making aggregation a little sadder on intermittent failures, will result in more desireable overall behaviour of keeping metrics available for further investigation if things look unusual. --- client/allocrunner/taskrunner/task_runner.go | 4 ++++ client/client.go | 5 ++++- client/stats/host.go | 16 ++++++++++------ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 454c682b609b..7cc7d6c9e3fc 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1335,10 +1335,14 @@ func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { if ru.ResourceUsage.MemoryStats != nil { tr.setGaugeForMemory(ru) + } else { + tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil") } if ru.ResourceUsage.CpuStats != nil { tr.setGaugeForCPU(ru) + } else { + tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil") } } diff --git a/client/client.go b/client/client.go index 6cc28fdfd194..98faa36525b1 100644 --- a/client/client.go +++ b/client/client.go @@ -2575,10 +2575,13 @@ func (c *Client) emitStats() { for { select { case <-next.C: + start := time.Now() err := c.hostStatsCollector.Collect() next.Reset(c.config.StatsCollectionInterval) + end := time.Now() + duration := end.Sub(start) if err != nil { - c.logger.Warn("error fetching host resource usage stats", "error", err) + c.logger.Warn("error fetching host resource usage stats", "error", err, "collection_duration", duration) continue } diff --git a/client/stats/host.go b/client/stats/host.go index 28d61906d711..04bf5b65b0e1 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -1,7 +1,6 @@ package stats import ( - "fmt" "math" "runtime" "sync" @@ -117,21 +116,25 @@ func (h *HostStatsCollector) collectLocked() error { // Determine up-time uptime, err := host.Uptime() if err != nil { - return err + h.logger.Error("failed to collect upstime stats", "error", err) + uptime = 0 } hs.Uptime = uptime // Collect memory stats mstats, err := h.collectMemoryStats() if err != nil { - return err + h.logger.Error("failed to collect memory stats", "error", err) + mstats = &MemoryStats{} } hs.Memory = mstats // Collect cpu stats cpus, ticks, err := h.collectCPUStats() if err != nil { - return err + h.logger.Error("failed to collect cpu stats", "error", err) + cpus = []*CPUStats{} + ticks = 0 } hs.CPU = cpus hs.CPUTicksConsumed = ticks @@ -139,14 +142,15 @@ func (h *HostStatsCollector) collectLocked() error { // Collect disk stats diskStats, err := h.collectDiskStats() if err != nil { - return err + h.logger.Error("failed to collect disk stats", "error", err) + hs.DiskStats = []*DiskStats{} } hs.DiskStats = diskStats // Getting the disk stats for the allocation directory usage, err := disk.Usage(h.allocDir) if err != nil { - return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err) + h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err) } hs.AllocDirStats = h.toDiskStats(usage, nil)