Skip to content

Commit

Permalink
client: Return empty values when host stats fail
Browse files Browse the repository at this point in the history
Currently, there is an issue when running on Windows whereby under some
circumstances the Windows stats API's will begin to return errors (such
as internal timeouts) when a client is under high load, and potentially
other forms of resource contention / system states (and other unknown
cases).

When an error occurs during this collection, we then short circuit
further metrics emission from the client until the next interval.

This can be problematic if it happens for a sustained number of
intervals, as our metrics aggregator will begin to age out older
metrics, and we will eventually stop emitting various types of metrics
including `nomad.client.unallocated.*` metrics.

However, when metrics collection fails on Linux, gopsutil will in many cases
(e.g cpu.Times) silently return 0 values, rather than an error.

Here, we switch to returning empty metrics in these failures, and
logging the error at the source. This brings the behaviour into line
with Linux/Unix platforms, and although making aggregation a little
sadder on intermittent failures, will result in more desireable overall
behaviour of keeping metrics available for further investigation if
things look unusual.
  • Loading branch information
endocrimes committed Sep 18, 2019
1 parent 8602b12 commit ff316c8
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 6 deletions.
4 changes: 4 additions & 0 deletions client/allocrunner/taskrunner/task_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -1335,10 +1335,14 @@ func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {

if ru.ResourceUsage.MemoryStats != nil {
tr.setGaugeForMemory(ru)
} else {
tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil")
}

if ru.ResourceUsage.CpuStats != nil {
tr.setGaugeForCPU(ru)
} else {
tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil")
}
}

Expand Down
5 changes: 4 additions & 1 deletion client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -2575,10 +2575,13 @@ func (c *Client) emitStats() {
for {
select {
case <-next.C:
start := time.Now()
err := c.hostStatsCollector.Collect()
next.Reset(c.config.StatsCollectionInterval)
end := time.Now()
duration := end.Sub(start)
if err != nil {
c.logger.Warn("error fetching host resource usage stats", "error", err)
c.logger.Warn("error fetching host resource usage stats", "error", err, "collection_duration", duration)
continue
}

Expand Down
15 changes: 10 additions & 5 deletions client/stats/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,36 +117,41 @@ func (h *HostStatsCollector) collectLocked() error {
// Determine up-time
uptime, err := host.Uptime()
if err != nil {
return err
h.logger.Error("failed to collect upstime stats", "error", err)
uptime = 0
}
hs.Uptime = uptime

// Collect memory stats
mstats, err := h.collectMemoryStats()
if err != nil {
return err
h.logger.Error("failed to collect memory stats", "error", err)
mstats = &MemoryStats{}
}
hs.Memory = mstats

// Collect cpu stats
cpus, ticks, err := h.collectCPUStats()
if err != nil {
return err
h.logger.Error("failed to collect cpu stats", "error", err)
cpus = []*CPUStats{}
ticks = 0
}
hs.CPU = cpus
hs.CPUTicksConsumed = ticks

// Collect disk stats
diskStats, err := h.collectDiskStats()
if err != nil {
return err
h.logger.Error("failed to collect disk stats", "error", err)
hs.DiskStats = []*DiskStats{}
}
hs.DiskStats = diskStats

// Getting the disk stats for the allocation directory
usage, err := disk.Usage(h.allocDir)
if err != nil {
return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err)
h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err)
}
hs.AllocDirStats = h.toDiskStats(usage, nil)

Expand Down

0 comments on commit ff316c8

Please sign in to comment.