diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 454c682b609b..7cc7d6c9e3fc 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1335,10 +1335,14 @@ func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { if ru.ResourceUsage.MemoryStats != nil { tr.setGaugeForMemory(ru) + } else { + tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil") } if ru.ResourceUsage.CpuStats != nil { tr.setGaugeForCPU(ru) + } else { + tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil") } } diff --git a/client/client.go b/client/client.go index 6cc28fdfd194..98faa36525b1 100644 --- a/client/client.go +++ b/client/client.go @@ -2575,10 +2575,13 @@ func (c *Client) emitStats() { for { select { case <-next.C: + start := time.Now() err := c.hostStatsCollector.Collect() next.Reset(c.config.StatsCollectionInterval) + end := time.Now() + duration := end.Sub(start) if err != nil { - c.logger.Warn("error fetching host resource usage stats", "error", err) + c.logger.Warn("error fetching host resource usage stats", "error", err, "collection_duration", duration) continue } diff --git a/client/stats/host.go b/client/stats/host.go index 28d61906d711..04bf5b65b0e1 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -1,7 +1,6 @@ package stats import ( - "fmt" "math" "runtime" "sync" @@ -117,21 +116,25 @@ func (h *HostStatsCollector) collectLocked() error { // Determine up-time uptime, err := host.Uptime() if err != nil { - return err + h.logger.Error("failed to collect upstime stats", "error", err) + uptime = 0 } hs.Uptime = uptime // Collect memory stats mstats, err := h.collectMemoryStats() if err != nil { - return err + h.logger.Error("failed to collect memory stats", "error", err) + mstats = &MemoryStats{} } hs.Memory = mstats // Collect cpu stats cpus, ticks, err := h.collectCPUStats() if err != nil { - return err + h.logger.Error("failed to collect cpu stats", "error", err) + cpus = []*CPUStats{} + ticks = 0 } hs.CPU = cpus hs.CPUTicksConsumed = ticks @@ -139,14 +142,15 @@ func (h *HostStatsCollector) collectLocked() error { // Collect disk stats diskStats, err := h.collectDiskStats() if err != nil { - return err + h.logger.Error("failed to collect disk stats", "error", err) + hs.DiskStats = []*DiskStats{} } hs.DiskStats = diskStats // Getting the disk stats for the allocation directory usage, err := disk.Usage(h.allocDir) if err != nil { - return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err) + h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err) } hs.AllocDirStats = h.toDiskStats(usage, nil)