diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 454c682b609b..7cc7d6c9e3fc 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1335,10 +1335,14 @@ func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { if ru.ResourceUsage.MemoryStats != nil { tr.setGaugeForMemory(ru) + } else { + tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil") } if ru.ResourceUsage.CpuStats != nil { tr.setGaugeForCPU(ru) + } else { + tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil") } } diff --git a/client/client.go b/client/client.go index 6cc28fdfd194..fe55e04860c2 100644 --- a/client/client.go +++ b/client/client.go @@ -2579,12 +2579,11 @@ func (c *Client) emitStats() { next.Reset(c.config.StatsCollectionInterval) if err != nil { c.logger.Warn("error fetching host resource usage stats", "error", err) - continue - } - - // Publish Node metrics if operator has opted in - if c.config.PublishNodeMetrics { - c.emitHostStats() + } else { + // Publish Node metrics if operator has opted in + if c.config.PublishNodeMetrics { + c.emitHostStats() + } } c.emitClientMetrics() diff --git a/client/stats/host.go b/client/stats/host.go index 28d61906d711..a434c9421faa 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -1,7 +1,6 @@ package stats import ( - "fmt" "math" "runtime" "sync" @@ -117,21 +116,25 @@ func (h *HostStatsCollector) collectLocked() error { // Determine up-time uptime, err := host.Uptime() if err != nil { - return err + h.logger.Error("failed to collect uptime stats", "error", err) + uptime = 0 } hs.Uptime = uptime // Collect memory stats mstats, err := h.collectMemoryStats() if err != nil { - return err + h.logger.Error("failed to collect memory stats", "error", err) + mstats = &MemoryStats{} } hs.Memory = mstats // Collect cpu stats cpus, ticks, err := h.collectCPUStats() if 
err != nil { - return err + h.logger.Error("failed to collect cpu stats", "error", err) + cpus = []*CPUStats{} + ticks = 0 } hs.CPU = cpus hs.CPUTicksConsumed = ticks @@ -139,17 +142,19 @@ func (h *HostStatsCollector) collectLocked() error { // Collect disk stats diskStats, err := h.collectDiskStats() if err != nil { - return err + h.logger.Error("failed to collect disk stats", "error", err) + diskStats = []*DiskStats{} } hs.DiskStats = diskStats // Getting the disk stats for the allocation directory usage, err := disk.Usage(h.allocDir) if err != nil { - return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err) + h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err) + hs.AllocDirStats = &DiskStats{} + } else { + hs.AllocDirStats = h.toDiskStats(usage, nil) } - hs.AllocDirStats = h.toDiskStats(usage, nil) - // Collect devices stats deviceStats := h.collectDeviceGroupStats() hs.DeviceStats = deviceStats diff --git a/command/agent/metrics_endpoint_test.go b/command/agent/metrics_endpoint_test.go index d1c65d0cd8b8..d97fe2e69cac 100644 --- a/command/agent/metrics_endpoint_test.go +++ b/command/agent/metrics_endpoint_test.go @@ -121,7 +121,7 @@ func TestHTTP_FreshClientAllocMetrics(t *testing.T) { terminal == float32(numTasks), nil }, func(err error) { require.Fail("timed out waiting for metrics to converge", - "pending: %v, running: %v, terminal: %v", pending, running, terminal) + "expected: (pending: 0, running: 0, terminal: %v), got: (pending: %v, running: %v, terminal: %v)", numTasks, pending, running, terminal) }) }) }