From ff316c8589bb636fcb935515b37a4cf089a9e2b4 Mon Sep 17 00:00:00 2001 From: Danielle Lancashire Date: Thu, 19 Sep 2019 00:57:23 +0200 Subject: [PATCH] client: Return empty values when host stats fail Currently, there is an issue when running on Windows whereby under some circumstances the Windows stats API's will begin to return errors (such as internal timeouts) when a client is under high load, and potentially other forms of resource contention / system states (and other unknown cases). When an error occurs during this collection, we then short circuit further metrics emission from the client until the next interval. This can be problematic if it happens for a sustained number of intervals, as our metrics aggregator will begin to age out older metrics, and we will eventually stop emitting various types of metrics including `nomad.client.unallocated.*` metrics. However, when metrics collection fails on Linux, gopsutil will in many cases (e.g cpu.Times) silently return 0 values, rather than an error. Here, we switch to returning empty metrics in these failures, and logging the error at the source. This brings the behaviour into line with Linux/Unix platforms, and although making aggregation a little sadder on intermittent failures, will result in more desireable overall behaviour of keeping metrics available for further investigation if things look unusual. --- client/allocrunner/taskrunner/task_runner.go | 4 ++++ client/client.go | 5 ++++- client/stats/host.go | 15 ++++++++++----- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 454c682b609b..7cc7d6c9e3fc 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1335,10 +1335,14 @@ func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { if ru.ResourceUsage.MemoryStats != nil { tr.setGaugeForMemory(ru) + } else { + tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil") } if ru.ResourceUsage.CpuStats != nil { tr.setGaugeForCPU(ru) + } else { + tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil") } } diff --git a/client/client.go b/client/client.go index 6cc28fdfd194..98faa36525b1 100644 --- a/client/client.go +++ b/client/client.go @@ -2575,10 +2575,13 @@ func (c *Client) emitStats() { for { select { case <-next.C: + start := time.Now() err := c.hostStatsCollector.Collect() next.Reset(c.config.StatsCollectionInterval) + end := time.Now() + duration := end.Sub(start) if err != nil { - c.logger.Warn("error fetching host resource usage stats", "error", err) + c.logger.Warn("error fetching host resource usage stats", "error", err, "collection_duration", duration) continue } diff --git a/client/stats/host.go b/client/stats/host.go index 28d61906d711..7fd17cd599c6 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -117,21 +117,25 @@ func (h *HostStatsCollector) collectLocked() error { // Determine up-time uptime, err := host.Uptime() if err != nil { - return err + h.logger.Error("failed to collect upstime stats", "error", err) + uptime = 0 } hs.Uptime = uptime // Collect memory stats mstats, err := h.collectMemoryStats() if err != nil { - return err + h.logger.Error("failed to collect memory stats", "error", err) + mstats = &MemoryStats{} } hs.Memory = mstats // Collect cpu stats cpus, ticks, err := h.collectCPUStats() if err != nil { - return err + h.logger.Error("failed to collect cpu stats", "error", err) + cpus = []*CPUStats{} + ticks = 0 } hs.CPU = cpus hs.CPUTicksConsumed = ticks @@ -139,14 +143,15 @@ func (h *HostStatsCollector) collectLocked() error { // Collect disk stats diskStats, err := h.collectDiskStats() if err != nil { - return err + h.logger.Error("failed to collect disk stats", "error", err) + hs.DiskStats = []*DiskStats{} } hs.DiskStats = diskStats // Getting the disk stats for the allocation directory usage, err := disk.Usage(h.allocDir) if err != nil { - return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err) + h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err) } hs.AllocDirStats = h.toDiskStats(usage, nil)