From f2956e80c60c21a6f2b6ef9dcc04cdaa4a9ee63c Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 29 Mar 2019 11:10:11 -0700 Subject: [PATCH] client: expose allocated memory per task Related to #4280 This PR adds `client.allocs.<job>.<task_group>.<alloc_id>.<task>.memory.allocated` as a gauge in bytes to metrics to ease calculating how close a task is to OOMing. ``` 'nomad.client.allocs.memory.allocated.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 268435456.000 'nomad.client.allocs.memory.cache.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 5677056.000 'nomad.client.allocs.memory.kernel_max_usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 0.000 'nomad.client.allocs.memory.kernel_usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 0.000 'nomad.client.allocs.memory.max_usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 8908800.000 'nomad.client.allocs.memory.rss.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 876544.000 'nomad.client.allocs.memory.swap.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 0.000 'nomad.client.allocs.memory.usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 8208384.000 ``` --- CHANGELOG.md | 2 ++ client/allocrunner/taskrunner/task_runner.go | 28 +++++++++++++++----- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ded28caa2118..4b39bd3667e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ FEATURES: IMPROVEMENTS: * core: Add node name to output of `nomad node status` command in verbose mode [[GH-5224](https://github.com/hashicorp/nomad/pull/5224)] + * telemetry: Add `client.allocs.memory.allocated` metric to expose allocated + task memory in bytes. 
[[GH-5492](https://github.com/hashicorp/nomad/issues/5492)] ## 0.9.0 (April 9, 2019) diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index e68e2a25638d..7ccf82f880d0 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1141,6 +1141,13 @@ func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) { //TODO Remove Backwardscompat or use tr.Alloc()? func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) { + alloc := tr.Alloc() + var allocatedMem float32 + if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil { + // Convert to bytes to match other memory metrics + allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024 + } + if !tr.clientConfig.DisableTaggedMetrics { metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels) @@ -1156,16 +1163,23 @@ func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) { float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels) metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels) + if allocatedMem > 0 { + metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"}, + allocatedMem, tr.baseLabels) + } } if tr.clientConfig.BackwardsCompatibleMetrics { - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, 
float32(ru.ResourceUsage.MemoryStats.Swap)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, 
float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) + if allocatedMem > 0 { + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem) + } } }