From 9d52c99c90667d754db033e4314fc078c6dd49ef Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Wed, 21 Apr 2021 21:31:33 -0400 Subject: [PATCH 1/3] add metrics for blocked eval resources --- api/allocations.go | 1 + nomad/blocked_evals.go | 82 +-- nomad/blocked_evals_stats.go | 209 ++++++++ nomad/blocked_evals_stats_test.go | 156 ++++++ nomad/blocked_evals_test.go | 475 ++++++++++-------- nomad/mock/mock.go | 20 + nomad/structs/structs.go | 36 +- scheduler/generic_sched.go | 6 +- scheduler/system_sched.go | 6 +- .../hashicorp/nomad/api/allocations.go | 1 + 10 files changed, 749 insertions(+), 243 deletions(-) create mode 100644 nomad/blocked_evals_stats.go create mode 100644 nomad/blocked_evals_stats_test.go diff --git a/api/allocations.go b/api/allocations.go index 0c1a01b1afd6..8dc837b390e2 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -374,6 +374,7 @@ type AllocationMetric struct { ClassExhausted map[string]int DimensionExhausted map[string]int QuotaExhausted []string + ResourcesExhausted map[string]*Resources // Deprecated, replaced with ScoreMetaData Scores map[string]float64 AllocationTime time.Duration diff --git a/nomad/blocked_evals.go b/nomad/blocked_evals.go index eba0a9f50457..4359bf7e5a0b 100644 --- a/nomad/blocked_evals.go +++ b/nomad/blocked_evals.go @@ -95,20 +95,6 @@ type wrappedEval struct { token string } -// BlockedStats returns all the stats about the blocked eval tracker. -type BlockedStats struct { - // TotalEscaped is the total number of blocked evaluations that have escaped - // computed node classes. - TotalEscaped int - - // TotalBlocked is the total number of blocked evaluations. - TotalBlocked int - - // TotalQuotaLimit is the total number of blocked evaluations that are due - // to the quota limit being reached. - TotalQuotaLimit int -} - // NewBlockedEvals creates a new blocked eval tracker that will enqueue // unblocked evals into the passed broker. func NewBlockedEvals(evalBroker *EvalBroker, logger log.Logger) *BlockedEvals { @@ -123,7 +109,7 @@ func NewBlockedEvals(evalBroker *EvalBroker, logger log.Logger) *BlockedEvals { capacityChangeCh: make(chan *capacityUpdate, unblockBuffer), duplicateCh: make(chan struct{}, 1), stopCh: make(chan struct{}), - stats: new(BlockedStats), + stats: NewBlockedStats(), } } @@ -209,7 +195,7 @@ func (b *BlockedEvals) processBlock(eval *structs.Evaluation, token string) { // Mark the job as tracked. 
b.jobs[structs.NewNamespacedID(eval.JobID, eval.Namespace)] = eval.ID - b.stats.TotalBlocked++ + b.stats.Block(eval) // Track that the evaluation is being added due to reaching the quota limit if eval.QuotaLimitReached != "" { @@ -263,7 +249,7 @@ func (b *BlockedEvals) processBlockJobDuplicate(eval *structs.Evaluation) (newCa if ok { if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) { delete(b.captured, existingID) - b.stats.TotalBlocked-- + b.stats.Unblock(eval) dup = existingW.eval } else { dup = eval @@ -379,7 +365,7 @@ func (b *BlockedEvals) Untrack(jobID, namespace string) { if evals, ok := b.system.JobEvals(nsID); ok { for _, e := range evals { b.system.Remove(e) - b.stats.TotalBlocked-- + b.stats.Unblock(e) } return } @@ -395,7 +381,7 @@ func (b *BlockedEvals) Untrack(jobID, namespace string) { if w, ok := b.captured[evalID]; ok { delete(b.jobs, nsID) delete(b.captured, evalID) - b.stats.TotalBlocked-- + b.stats.Unblock(w.eval) if w.eval.QuotaLimitReached != "" { b.stats.TotalQuotaLimit-- } @@ -405,7 +391,7 @@ func (b *BlockedEvals) Untrack(jobID, namespace string) { delete(b.jobs, nsID) delete(b.escaped, evalID) b.stats.TotalEscaped-- - b.stats.TotalBlocked-- + b.stats.Unblock(w.eval) if w.eval.QuotaLimitReached != "" { b.stats.TotalQuotaLimit-- } @@ -511,7 +497,7 @@ func (b *BlockedEvals) UnblockNode(nodeID string, index uint64) { for e := range evals { b.system.Remove(e) - b.stats.TotalBlocked-- + b.stats.Unblock(e) } b.evalBroker.EnqueueAll(evals) @@ -583,11 +569,13 @@ func (b *BlockedEvals) unblock(computedClass, quota string, index uint64) { } } - if l := len(unblocked); l != 0 { + if len(unblocked) != 0 { // Update the counters b.stats.TotalEscaped = 0 - b.stats.TotalBlocked -= l b.stats.TotalQuotaLimit -= numQuotaLimit + for eval := range unblocked { + b.stats.Unblock(eval) + } // Enqueue all the unblocked evals into the broker. b.evalBroker.EnqueueAll(unblocked) @@ -630,9 +618,12 @@ func (b *BlockedEvals) UnblockFailed() { } } - if l := len(unblocked); l > 0 { - b.stats.TotalBlocked -= l + if len(unblocked) > 0 { b.stats.TotalQuotaLimit -= quotaLimit + for eval := range unblocked { + b.stats.Unblock(eval) + } + b.evalBroker.EnqueueAll(unblocked) } } @@ -683,6 +674,7 @@ func (b *BlockedEvals) Flush() { b.stats.TotalEscaped = 0 b.stats.TotalBlocked = 0 b.stats.TotalQuotaLimit = 0 + b.stats.BlockedResources = NewBlockedResourcesStats() b.captured = make(map[string]wrappedEval) b.escaped = make(map[string]wrappedEval) b.jobs = make(map[structs.NamespacedID]string) @@ -698,7 +690,7 @@ func (b *BlockedEvals) Flush() { // Stats is used to query the state of the blocked eval tracker. 
func (b *BlockedEvals) Stats() *BlockedStats {
 	// Allocate a new stats struct
-	stats := new(BlockedStats)
+	stats := NewBlockedStats()
 
 	b.l.RLock()
 	defer b.l.RUnlock()
@@ -707,6 +699,8 @@ func (b *BlockedEvals) Stats() *BlockedStats {
 	stats.TotalEscaped = b.stats.TotalEscaped
 	stats.TotalBlocked = b.stats.TotalBlocked
 	stats.TotalQuotaLimit = b.stats.TotalQuotaLimit
+	stats.BlockedResources = b.stats.BlockedResources.Copy()
+
 	return stats
 }
 
@@ -719,6 +713,24 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
 			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_quota_limit"}, float32(stats.TotalQuotaLimit))
 			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_blocked"}, float32(stats.TotalBlocked))
 			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_escaped"}, float32(stats.TotalEscaped))
+
+			for k, v := range stats.BlockedResources.ByJob {
+				labels := []metrics.Label{
+					{Name: "namespace", Value: k.Namespace},
+					{Name: "job", Value: k.ID},
+				}
+				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "cpu"}, float32(v.CPU), labels)
+				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels)
+			}
+
+			for k, v := range stats.BlockedResources.ByNodeInfo {
+				labels := []metrics.Label{
+					{Name: "datacenter", Value: k.Datacenter},
+					{Name: "node_class", Value: k.NodeClass},
+				}
+				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, float32(v.CPU), labels)
+				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "memory"}, float32(v.MemoryMB), labels)
+			}
 		case <-stopCh:
 			return
 		}
@@ -734,15 +746,17 @@ func (b *BlockedEvals) prune(stopCh <-chan struct{}) {
 		select {
 		case <-stopCh:
 			return
-		case <-ticker.C:
-			b.pruneUnblockIndexes()
+		case t := <-ticker.C:
+			cutoff := t.UTC().Add(-1 * pruneThreshold)
+			b.pruneUnblockIndexes(cutoff)
+			b.stats.prune(cutoff)
 		}
 	}
 }
 
 // pruneUnblockIndexes is used to prune any tracked entry that is excessively
 // old. This protects against unbounded growth of the map.
-func (b *BlockedEvals) pruneUnblockIndexes() {
+func (b *BlockedEvals) pruneUnblockIndexes(cutoff time.Time) {
 	b.l.Lock()
 	defer b.l.Unlock()
 
@@ -750,12 +764,18 @@
 		return
 	}
 
-	cutoff := time.Now().UTC().Add(-1 * pruneThreshold)
 	oldThreshold := b.timetable.NearestIndex(cutoff)
-
 	for key, index := range b.unblockIndexes {
 		if index < oldThreshold {
 			delete(b.unblockIndexes, key)
 		}
 	}
 }
+
+// pruneStats is used to prune any zero-value stats that are excessively old.
+func (b *BlockedEvals) pruneStats(cutoff time.Time) {
+	b.l.Lock()
+	defer b.l.Unlock()
+
+	b.stats.prune(cutoff)
+}
diff --git a/nomad/blocked_evals_stats.go b/nomad/blocked_evals_stats.go
new file mode 100644
index 000000000000..171f4f9fdd3f
--- /dev/null
+++ b/nomad/blocked_evals_stats.go
@@ -0,0 +1,209 @@
+package nomad
+
+import (
+	"time"
+
+	"github.com/hashicorp/nomad/nomad/structs"
+)
+
+// BlockedStats holds all of the stats about the blocked eval tracker.
+type BlockedStats struct {
+	// TotalEscaped is the total number of blocked evaluations that have escaped
+	// computed node classes.
+	TotalEscaped int
+
+	// TotalBlocked is the total number of blocked evaluations.
+	TotalBlocked int
+
+	// TotalQuotaLimit is the total number of blocked evaluations that are due
+	// to the quota limit being reached.
+	TotalQuotaLimit int
+
+	// BlockedResources stores the amount of resources requested by blocked
+	// evaluations.
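+	//
+	// These values back the labeled gauges emitted by EmitStats:
+	// nomad.blocked_evals.job.{cpu,memory}, labeled by namespace and job, and
+	// nomad.blocked_evals.{cpu,memory}, labeled by datacenter and node_class.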
+	BlockedResources BlockedResourcesStats
+}
+
+// NewBlockedStats returns a new BlockedStats.
+func NewBlockedStats() *BlockedStats {
+	return &BlockedStats{
+		BlockedResources: NewBlockedResourcesStats(),
+	}
+}
+
+// Block updates the stats for the blocked eval tracker with the details of the
+// evaluation being blocked.
+func (b *BlockedStats) Block(eval *structs.Evaluation) {
+	b.TotalBlocked++
+	resourceStats := generateResourceStats(eval)
+	b.BlockedResources = b.BlockedResources.Add(resourceStats)
+}
+
+// Unblock updates the stats for the blocked eval tracker with the details of the
+// evaluation being unblocked.
+func (b *BlockedStats) Unblock(eval *structs.Evaluation) {
+	b.TotalBlocked--
+	resourceStats := generateResourceStats(eval)
+	b.BlockedResources = b.BlockedResources.Subtract(resourceStats)
+}
+
+// prune deletes any entries whose metric values are all zero and whose
+// timestamp is older than the cutoff.
+func (b *BlockedStats) prune(cutoff time.Time) {
+	shouldPrune := func(s BlockedResourcesSummary) bool {
+		return s.Timestamp.Before(cutoff) && s.IsZero()
+	}
+
+	for k, v := range b.BlockedResources.ByJob {
+		if shouldPrune(v) {
+			delete(b.BlockedResources.ByJob, k)
+		}
+	}
+
+	for k, v := range b.BlockedResources.ByNodeInfo {
+		if shouldPrune(v) {
+			delete(b.BlockedResources.ByNodeInfo, k)
+		}
+	}
+}
+
+// generateResourceStats returns a summary of the resources requested by the
+// input evaluation.
+func generateResourceStats(eval *structs.Evaluation) BlockedResourcesStats {
+	dcs := make(map[string]struct{})
+	classes := make(map[string]struct{})
+
+	resources := BlockedResourcesSummary{
+		Timestamp: time.Now().UTC(),
+	}
+
+	for _, allocMetrics := range eval.FailedTGAllocs {
+		for dc := range allocMetrics.NodesAvailable {
+			dcs[dc] = struct{}{}
+		}
+
+		for class := range allocMetrics.ClassExhausted {
+			classes[class] = struct{}{}
+		}
+
+		for _, r := range allocMetrics.ResourcesExhausted {
+			resources.CPU += r.CPU
+			resources.MemoryMB += r.MemoryMB
+		}
+	}
+
+	byJob := make(map[structs.NamespacedID]BlockedResourcesSummary)
+	byJob[structs.NewNamespacedID(eval.JobID, eval.Namespace)] = resources
+
+	byNodeInfo := make(map[NodeInfo]BlockedResourcesSummary)
+	for dc := range dcs {
+		for class := range classes {
+			k := NodeInfo{dc, class}
+			byNodeInfo[k] = resources
+		}
+	}
+
+	return BlockedResourcesStats{
+		ByJob:      byJob,
+		ByNodeInfo: byNodeInfo,
+	}
+}
+
+// BlockedResourcesStats stores resources requested by blocked evaluations
+// split into different dimensions.
+type BlockedResourcesStats struct {
+	ByJob      map[structs.NamespacedID]BlockedResourcesSummary
+	ByNodeInfo map[NodeInfo]BlockedResourcesSummary
+}
+
+// NewBlockedResourcesStats returns a new BlockedResourcesStats.
+func NewBlockedResourcesStats() BlockedResourcesStats {
+	return BlockedResourcesStats{
+		ByJob:      make(map[structs.NamespacedID]BlockedResourcesSummary),
+		ByNodeInfo: make(map[NodeInfo]BlockedResourcesSummary),
+	}
+}
+
+// Copy returns a deep copy of the blocked resource stats.
+func (b BlockedResourcesStats) Copy() BlockedResourcesStats {
+	result := NewBlockedResourcesStats()
+
+	for k, v := range b.ByJob {
+		result.ByJob[k] = v
+	}
+
+	for k, v := range b.ByNodeInfo {
+		result.ByNodeInfo[k] = v
+	}
+
+	return result
+}
+
+// Add returns a new BlockedResourcesStats with the values set to the current
+// resource values plus the input.
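+//
+// Entries are merged per key. For example (illustrative values only):
+//
+//	b:        ByJob[{ns, job}] = {CPU: 500, MemoryMB: 256}
+//	a:        ByJob[{ns, job}] = {CPU: 250, MemoryMB: 256}
+//	b.Add(a): ByJob[{ns, job}] = {CPU: 750, MemoryMB: 512}
+//
+// A key present in only one of the two operands is carried over unchanged,
+// since the zero value of BlockedResourcesSummary acts as the identity.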
+func (b BlockedResourcesStats) Add(a BlockedResourcesStats) BlockedResourcesStats { + result := b.Copy() + + for k, v := range a.ByJob { + result.ByJob[k] = b.ByJob[k].Add(v) + } + + for k, v := range a.ByNodeInfo { + result.ByNodeInfo[k] = b.ByNodeInfo[k].Add(v) + } + + return result +} + +// Subtract returns a new BlockedResourcesStats with the values set to the +// current resource values minus the input. +func (b BlockedResourcesStats) Subtract(a BlockedResourcesStats) BlockedResourcesStats { + result := b.Copy() + + for k, v := range a.ByJob { + result.ByJob[k] = b.ByJob[k].Subtract(v) + } + + for k, v := range a.ByNodeInfo { + result.ByNodeInfo[k] = b.ByNodeInfo[k].Subtract(v) + } + + return result +} + +// NodeInfo stores information related to nodes. +type NodeInfo struct { + Datacenter string + NodeClass string +} + +// BlockedResourcesSummary stores resource values for blocked evals. +type BlockedResourcesSummary struct { + Timestamp time.Time + CPU int + MemoryMB int +} + +// Add returns a new BlockedResourcesSummary with each resource set to the +// current value plus the input. +func (b BlockedResourcesSummary) Add(a BlockedResourcesSummary) BlockedResourcesSummary { + return BlockedResourcesSummary{ + Timestamp: a.Timestamp, + CPU: b.CPU + a.CPU, + MemoryMB: b.MemoryMB + a.MemoryMB, + } +} + +// Subtract returns a new BlockedResourcesSummary with each resource set to the +// current value minus the input. +func (b BlockedResourcesSummary) Subtract(a BlockedResourcesSummary) BlockedResourcesSummary { + return BlockedResourcesSummary{ + Timestamp: a.Timestamp, + CPU: b.CPU - a.CPU, + MemoryMB: b.MemoryMB - a.MemoryMB, + } +} + +// IsZero returns true if all resource values are zero. +func (b BlockedResourcesSummary) IsZero() bool { + return b.CPU == 0 && b.MemoryMB == 0 +} diff --git a/nomad/blocked_evals_stats_test.go b/nomad/blocked_evals_stats_test.go new file mode 100644 index 000000000000..235e451c19db --- /dev/null +++ b/nomad/blocked_evals_stats_test.go @@ -0,0 +1,156 @@ +package nomad + +import ( + "fmt" + "math/rand" + "reflect" + "testing" + "testing/quick" + "time" + + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" +) + +// testBlockedEvalsRandomBlockedEval wraps an eval that is randomly generated. +type testBlockedEvalsRandomBlockedEval struct { + eval *structs.Evaluation +} + +// Generate returns a random eval. +func (t testBlockedEvalsRandomBlockedEval) Generate(rand *rand.Rand, _ int) reflect.Value { + resourceTypes := []string{"cpu", "memory"} + + // Start with a mock eval. + e := mock.BlockedEval() + + // Get how many task groups, datacenters and node classes to generate. + // Add 1 to avoid 0. + tgCount := rand.Intn(10) + 1 + dcCount := rand.Intn(3) + 1 + nodeClassCount := rand.Intn(3) + 1 + + failedTGAllocs := map[string]*structs.AllocMetric{} + + for tg := 1; tg <= tgCount; tg++ { + tgName := fmt.Sprintf("group-%d", tg) + + // Get which resource type to use for this task group. + // Nomad stops at the first dimension that is exhausted, so only 1 is + // added per task group. 
+		i := rand.Int() % len(resourceTypes)
+		resourceType := resourceTypes[i]
+
+		failedTGAllocs[tgName] = &structs.AllocMetric{
+			DimensionExhausted: map[string]int{
+				resourceType: 1,
+			},
+			NodesAvailable: map[string]int{},
+			ClassExhausted: map[string]int{},
+		}
+
+		for dc := 1; dc <= dcCount; dc++ {
+			dcName := fmt.Sprintf("dc%d", dc)
+			failedTGAllocs[tgName].NodesAvailable[dcName] = 1
+		}
+
+		for nc := 1; nc <= nodeClassCount; nc++ {
+			nodeClassName := fmt.Sprintf("node-class-%d", nc)
+			failedTGAllocs[tgName].ClassExhausted[nodeClassName] = 1
+		}
+
+		// Generate resources for each task.
+		taskCount := rand.Intn(5) + 1
+		resourcesExhausted := map[string]*structs.Resources{}
+
+		for t := 1; t <= taskCount; t++ {
+			task := fmt.Sprintf("task-%d", t)
+			resourcesExhausted[task] = &structs.Resources{}
+
+			resourceAmount := rand.Intn(1000)
+			switch resourceType {
+			case "cpu":
+				resourcesExhausted[task].CPU = resourceAmount
+			case "memory":
+				resourcesExhausted[task].MemoryMB = resourceAmount
+			}
+		}
+		failedTGAllocs[tgName].ResourcesExhausted = resourcesExhausted
+	}
+	e.FailedTGAllocs = failedTGAllocs
+	t.eval = e
+	return reflect.ValueOf(t)
+}
+
+// clearTimestampFromBlockedResourceStats sets timestamps to zero to avoid
+// invalid comparisons between otherwise-equal stats.
+func clearTimestampFromBlockedResourceStats(b *BlockedResourcesStats) {
+	for k, v := range b.ByJob {
+		v.Timestamp = time.Time{}
+		b.ByJob[k] = v
+	}
+	for k, v := range b.ByNodeInfo {
+		v.Timestamp = time.Time{}
+		b.ByNodeInfo[k] = v
+	}
+}
+
+// TestBlockedEvalsStats_BlockedResources generates random evals and processes
+// them using the real code paths, comparing the result against a manually
+// computed count.
+func TestBlockedEvalsStats_BlockedResources(t *testing.T) {
+	t.Parallel()
+	blocked, _ := testBlockedEvals(t)
+
+	// evalHistory stores all evals generated during the test.
+	evalHistory := []*structs.Evaluation{}
+
+	// blockedEvals tracks whether each eval is currently blocked or unblocked.
+	blockedEvals := map[string]bool{}
+
+	// blockAndUntrack processes the generated evals in order using a
+	// BlockedEvals instance.
+	blockAndUntrack := func(testEval testBlockedEvalsRandomBlockedEval, block bool, unblockIdx uint16) BlockedResourcesStats {
+		if block || len(evalHistory) == 0 {
+			blocked.Block(testEval.eval)
+		} else {
+			i := int(unblockIdx) % len(evalHistory)
+			eval := evalHistory[i]
+			blocked.Untrack(eval.JobID, eval.Namespace)
+		}
+
+		// Remove zero stats from unblocked evals.
+		blocked.pruneStats(time.Now().UTC())
+
+		result := blocked.Stats().BlockedResources
+		clearTimestampFromBlockedResourceStats(&result)
+		return result
+	}
+
+	// manualCount processes only the blocked evals and generates a
+	// BlockedResourcesStats result directly from the eval history.
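+	//
+	// quick.CheckEqual drives blockAndUntrack and manualCount with the same
+	// pseudo-random arguments and fails if their outputs ever diverge, making
+	// manualCount the model the real implementation is checked against.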
+ manualCount := func(testEval testBlockedEvalsRandomBlockedEval, block bool, unblockIdx uint16) BlockedResourcesStats { + if block || len(evalHistory) == 0 { + evalHistory = append(evalHistory, testEval.eval) + blockedEvals[testEval.eval.ID] = true + } else { + i := int(unblockIdx) % len(evalHistory) + eval := evalHistory[i] + blockedEvals[eval.ID] = false + } + + result := NewBlockedResourcesStats() + for _, e := range evalHistory { + if !blockedEvals[e.ID] { + continue + } + result = result.Add(generateResourceStats(e)) + } + clearTimestampFromBlockedResourceStats(&result) + return result + } + + err := quick.CheckEqual(blockAndUntrack, manualCount, nil) + if err != nil { + t.Error(err) + } +} diff --git a/nomad/blocked_evals_test.go b/nomad/blocked_evals_test.go index f5c2961f8bfe..244308fba8eb 100644 --- a/nomad/blocked_evals_test.go +++ b/nomad/blocked_evals_test.go @@ -2,7 +2,6 @@ package nomad import ( "fmt" - "reflect" "testing" "time" @@ -23,108 +22,115 @@ func testBlockedEvals(t *testing.T) (*BlockedEvals, *EvalBroker) { func TestBlockedEvals_Block_Disabled(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) blocked.SetEnabled(false) // Create an escaped eval and add it to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.EscapedComputedClass = true blocked.Block(e) - // Verify block did nothing - bStats := blocked.Stats() - if bStats.TotalBlocked != 0 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + // Verify block did nothing. + blockedStats := blocked.Stats() + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 0) } func TestBlockedEvals_Block_SameJob(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) // Create two blocked evals and add them to the blocked tracker. - e := mock.Eval() - e2 := mock.Eval() + e := mock.BlockedEval() + e2 := mock.BlockedEval() e2.JobID = e.JobID blocked.Block(e) blocked.Block(e2) - // Verify block did track both - bStats := blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + // Verify block didn't track duplicate. + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) } func TestBlockedEvals_Block_Quota(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) - // Create a blocked evals on quota - e := mock.Eval() + // Create a blocked eval on quota. + e := mock.BlockedEval() e.QuotaLimitReached = "foo" blocked.Block(e) - // Verify block did track both - bs := blocked.Stats() - if bs.TotalBlocked != 1 || bs.TotalEscaped != 0 || bs.TotalQuotaLimit != 1 { - t.Fatalf("bad: %#v", bs) - } + // Verify block did track eval. + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Equal(1, blockedStats.TotalQuotaLimit) } func TestBlockedEvals_Block_PriorUnblocks(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) - // Do unblocks prior to blocking + // Do unblocks prior to blocking. blocked.Unblock("v1:123", 1000) blocked.Unblock("v1:123", 1001) - // Create two blocked evals and add them to the blocked tracker. 
- e := mock.Eval() - e.Status = structs.EvalStatusBlocked + // Create blocked eval with two classes ineligible. + e := mock.BlockedEval() e.ClassEligibility = map[string]bool{"v1:123": false, "v1:456": false} e.SnapshotIndex = 999 blocked.Block(e) - // Verify block did track both - bStats := blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + // Verify block did track eval. + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) } func TestBlockedEvals_GetDuplicates(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) // Create duplicate blocked evals and add them to the blocked tracker. - e := mock.Eval() + e := mock.BlockedEval() e.CreateIndex = 100 - e2 := mock.Eval() + e2 := mock.BlockedEval() e2.JobID = e.JobID e2.CreateIndex = 101 - e3 := mock.Eval() + e3 := mock.BlockedEval() e3.JobID = e.JobID e3.CreateIndex = 102 - e4 := mock.Eval() + e4 := mock.BlockedEval() e4.JobID = e.JobID e4.CreateIndex = 100 blocked.Block(e) blocked.Block(e2) - // Verify stats such that we are only tracking one - bStats := blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + // Verify stats such that we are only tracking one. + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) // Get the duplicates. out := blocked.GetDuplicates(0) - if len(out) != 1 || !reflect.DeepEqual(out[0], e) { - t.Fatalf("bad: %#v %#v", out, e) - } + require.Len(out, 1) + require.Equal(e, out[0]) // Call block again after a small sleep. go func() { @@ -134,45 +140,45 @@ func TestBlockedEvals_GetDuplicates(t *testing.T) { // Get the duplicates. out = blocked.GetDuplicates(1 * time.Second) - if len(out) != 1 || !reflect.DeepEqual(out[0], e2) { - t.Fatalf("bad: %#v %#v", out, e2) - } + require.Len(out, 1) + require.Equal(e2, out[0]) - // Verify stats such that we are only tracking one - bStats = blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + // Verify stats such that we are only tracking one. + blockedStats = blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) - // Add an older evaluation and assert it gets cancelled + // Add an older evaluation and assert it gets cancelled. blocked.Block(e4) out = blocked.GetDuplicates(0) - if len(out) != 1 || !reflect.DeepEqual(out[0], e4) { - t.Fatalf("bad: %#v %#v", out, e4) - } - - // Verify stats such that we are only tracking one - bStats = blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + require.Len(out, 1) + require.Equal(e4, out[0]) + + // Verify stats such that we are only tracking one. + blockedStats = blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) } func TestBlockedEvals_UnblockEscaped(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Create an escaped eval and add it to the blocked tracker. 
- e := mock.Eval() + e := mock.BlockedEval() e.Status = structs.EvalStatusBlocked e.EscapedComputedClass = true blocked.Block(e) // Verify block caused the eval to be tracked - bStats := blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 1 { - t.Fatalf("bad: %#v", bStats) - } + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(1, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) blocked.Unblock("v1:123", 1000) requireBlockedEvalsEnqueued(t, blocked, broker, 1) @@ -186,10 +192,16 @@ func requireBlockedEvalsEnqueued(t *testing.T, blocked *BlockedEvals, broker *Ev return false, fmt.Errorf("missing enqueued evals: %#v", brokerStats) } + // Prune old and empty metrics. + blocked.pruneStats(time.Now().UTC()) + // Verify Unblock updates the stats - bStats := blocked.Stats() - if bStats.TotalBlocked != 0 || bStats.TotalEscaped != 0 { - return false, fmt.Errorf("evals still blocked: %#v", bStats) + blockedStats := blocked.Stats() + ok := blockedStats.TotalBlocked == 0 && + blockedStats.TotalEscaped == 0 && + len(blockedStats.BlockedResources.ByJob) == 0 + if !ok { + return false, fmt.Errorf("evals still blocked: %#v", blockedStats) } return true, nil }, func(err error) { @@ -199,20 +211,20 @@ func requireBlockedEvalsEnqueued(t *testing.T, blocked *BlockedEvals, broker *Ev func TestBlockedEvals_UnblockEligible(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Create a blocked eval that is eligible on a specific node class and add // it to the blocked tracker. - e := mock.Eval() + e := mock.BlockedEval() e.Status = structs.EvalStatusBlocked e.ClassEligibility = map[string]bool{"v1:123": true} blocked.Block(e) // Verify block caused the eval to be tracked blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 1 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(1, blockedStats.TotalBlocked) blocked.Unblock("v1:123", 1000) requireBlockedEvalsEnqueued(t, blocked, broker, 1) @@ -220,20 +232,21 @@ func TestBlockedEvals_UnblockEligible(t *testing.T) { func TestBlockedEvals_UnblockIneligible(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Create a blocked eval that is ineligible on a specific node class and add // it to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.ClassEligibility = map[string]bool{"v1:123": false} blocked.Block(e) // Verify block caused the eval to be tracked blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 1 && blockedStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) // Should do nothing blocked.Unblock("v1:123", 1000) @@ -242,12 +255,18 @@ func TestBlockedEvals_UnblockIneligible(t *testing.T) { // Verify Unblock didn't cause an enqueue brokerStats := broker.Stats() if brokerStats.TotalReady != 0 { - return false, fmt.Errorf("bad: %#v", brokerStats) + return false, fmt.Errorf("eval unblocked: %#v", brokerStats) } - bStats := blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 { - return false, fmt.Errorf("bad: %#v", bStats) + // Prune old and empty metrics. 
+ blocked.pruneStats(time.Now().UTC()) + + blockedStats := blocked.Stats() + ok := blockedStats.TotalBlocked == 1 && + blockedStats.TotalEscaped == 0 && + len(blockedStats.BlockedResources.ByJob) == 1 + if !ok { + return false, fmt.Errorf("eval unblocked: %#v", blockedStats) } return true, nil }, func(err error) { @@ -257,20 +276,21 @@ func TestBlockedEvals_UnblockIneligible(t *testing.T) { func TestBlockedEvals_UnblockUnknown(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Create a blocked eval that is ineligible on a specific node class and add // it to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.ClassEligibility = map[string]bool{"v1:123": true, "v1:456": false} blocked.Block(e) - // Verify block caused the eval to be tracked + // Verify block caused the eval to be tracked. blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 1 && blockedStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) // Should unblock because the eval hasn't seen this node class. blocked.Unblock("v1:789", 1000) @@ -279,19 +299,20 @@ func TestBlockedEvals_UnblockUnknown(t *testing.T) { func TestBlockedEvals_UnblockEligible_Quota(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) - // Create a blocked eval that is eligible for a particular quota - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + // Create a blocked eval that is eligible for a particular quota. + e := mock.BlockedEval() e.QuotaLimitReached = "foo" blocked.Block(e) - // Verify block caused the eval to be tracked - bs := blocked.Stats() - if bs.TotalBlocked != 1 || bs.TotalQuotaLimit != 1 { - t.Fatalf("bad: %#v", bs) - } + // Verify block caused the eval to be tracked. + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(1, blockedStats.TotalQuotaLimit) + require.Len(blockedStats.BlockedResources.ByJob, 1) blocked.UnblockQuota("foo", 1000) requireBlockedEvalsEnqueued(t, blocked, broker, 1) @@ -299,33 +320,41 @@ func TestBlockedEvals_UnblockEligible_Quota(t *testing.T) { func TestBlockedEvals_UnblockIneligible_Quota(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) - // Create a blocked eval that is eligible on a specific quota - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + // Create a blocked eval that is eligible on a specific quota. + e := mock.BlockedEval() e.QuotaLimitReached = "foo" blocked.Block(e) - // Verify block caused the eval to be tracked - bs := blocked.Stats() - if bs.TotalBlocked != 1 || bs.TotalQuotaLimit != 1 { - t.Fatalf("bad: %#v", bs) - } + // Verify block caused the eval to be tracked. + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(1, blockedStats.TotalQuotaLimit) + require.Len(blockedStats.BlockedResources.ByJob, 1) - // Should do nothing + // Should do nothing. 
blocked.UnblockQuota("bar", 1000) testutil.WaitForResult(func() (bool, error) { // Verify Unblock didn't cause an enqueue brokerStats := broker.Stats() if brokerStats.TotalReady != 0 { - return false, fmt.Errorf("bad: %#v", brokerStats) + return false, fmt.Errorf("eval unblocked: %#v", brokerStats) } - bs := blocked.Stats() - if bs.TotalBlocked != 1 || bs.TotalEscaped != 0 || bs.TotalQuotaLimit != 1 { - return false, fmt.Errorf("bad: %#v", bs) + // Prune old and empty metrics. + blocked.pruneStats(time.Now().UTC()) + + blockedStats := blocked.Stats() + ok := blockedStats.TotalBlocked == 1 && + blockedStats.TotalEscaped == 0 && + blockedStats.TotalQuotaLimit == 1 && + len(blockedStats.BlockedResources.ByJob) == 1 + if !ok { + return false, fmt.Errorf("eval unblocked: %#v", blockedStats) } return true, nil }, func(err error) { @@ -335,42 +364,39 @@ func TestBlockedEvals_UnblockIneligible_Quota(t *testing.T) { func TestBlockedEvals_Reblock(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Create an evaluation, Enqueue/Dequeue it to get a token - e := mock.Eval() + e := mock.BlockedEval() e.SnapshotIndex = 500 - e.Status = structs.EvalStatusBlocked e.ClassEligibility = map[string]bool{"v1:123": true, "v1:456": false} broker.Enqueue(e) _, token, err := broker.Dequeue([]string{e.Type}, time.Second) - if err != nil { - t.Fatalf("err: %v", err) - } + require.NoError(err) // Reblock the evaluation blocked.Reblock(e, token) // Verify block caused the eval to be tracked blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 1 && blockedStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) // Should unblock because the eval blocked.Unblock("v1:123", 1000) brokerStats := broker.Stats() - if brokerStats.TotalReady != 0 && brokerStats.TotalUnacked != 1 { - t.Fatalf("bad: %#v", brokerStats) - } + require.Equal(0, brokerStats.TotalReady) + require.Equal(1, brokerStats.TotalUnacked) // Ack the evaluation which should cause the reblocked eval to transition // to ready - if err := broker.Ack(e.ID, token); err != nil { - t.Fatalf("err: %v", err) - } + err = broker.Ack(e.ID, token) + require.NoError(err) requireBlockedEvalsEnqueued(t, blocked, broker, 1) } @@ -379,6 +405,8 @@ func TestBlockedEvals_Reblock(t *testing.T) { // it is escaped and old func TestBlockedEvals_Block_ImmediateUnblock_Escaped(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Do an unblock prior to blocking @@ -386,17 +414,16 @@ func TestBlockedEvals_Block_ImmediateUnblock_Escaped(t *testing.T) { // Create a blocked eval that is eligible on a specific node class and add // it to the blocked tracker. 
- e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.EscapedComputedClass = true e.SnapshotIndex = 900 blocked.Block(e) // Verify block caused the eval to be immediately unblocked blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 0 && blockedStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 0) requireBlockedEvalsEnqueued(t, blocked, broker, 1) } @@ -406,6 +433,8 @@ func TestBlockedEvals_Block_ImmediateUnblock_Escaped(t *testing.T) { // scheduler func TestBlockedEvals_Block_ImmediateUnblock_UnseenClass_After(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Do an unblock prior to blocking @@ -413,17 +442,16 @@ func TestBlockedEvals_Block_ImmediateUnblock_UnseenClass_After(t *testing.T) { // Create a blocked eval that is eligible on a specific node class and add // it to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.EscapedComputedClass = false e.SnapshotIndex = 900 blocked.Block(e) // Verify block caused the eval to be immediately unblocked blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 0 && blockedStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 0) requireBlockedEvalsEnqueued(t, blocked, broker, 1) } @@ -433,6 +461,8 @@ func TestBlockedEvals_Block_ImmediateUnblock_UnseenClass_After(t *testing.T) { // scheduler func TestBlockedEvals_Block_ImmediateUnblock_UnseenClass_Before(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) // Do an unblock prior to blocking @@ -440,23 +470,24 @@ func TestBlockedEvals_Block_ImmediateUnblock_UnseenClass_Before(t *testing.T) { // Create a blocked eval that is eligible on a specific node class and add // it to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.EscapedComputedClass = false e.SnapshotIndex = 900 blocked.Block(e) // Verify block caused the eval to be immediately unblocked blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 1 && blockedStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) } // Test the block case in which the eval should be immediately unblocked since // it a class it is eligible for has been unblocked func TestBlockedEvals_Block_ImmediateUnblock_SeenClass(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Do an unblock prior to blocking @@ -464,17 +495,16 @@ func TestBlockedEvals_Block_ImmediateUnblock_SeenClass(t *testing.T) { // Create a blocked eval that is eligible on a specific node class and add // it to the blocked tracker. 
- e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.ClassEligibility = map[string]bool{"v1:123": true, "v1:456": false} e.SnapshotIndex = 900 blocked.Block(e) // Verify block caused the eval to be immediately unblocked blockedStats := blocked.Stats() - if blockedStats.TotalBlocked != 0 && blockedStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", blockedStats) - } + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 0) requireBlockedEvalsEnqueued(t, blocked, broker, 1) } @@ -483,6 +513,8 @@ func TestBlockedEvals_Block_ImmediateUnblock_SeenClass(t *testing.T) { // it a quota has changed that it is using func TestBlockedEvals_Block_ImmediateUnblock_Quota(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Do an unblock prior to blocking @@ -490,40 +522,40 @@ func TestBlockedEvals_Block_ImmediateUnblock_Quota(t *testing.T) { // Create a blocked eval that is eligible on a specific node class and add // it to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.QuotaLimitReached = "my-quota" e.SnapshotIndex = 900 blocked.Block(e) // Verify block caused the eval to be immediately unblocked - bs := blocked.Stats() - if bs.TotalBlocked != 0 && bs.TotalEscaped != 0 && bs.TotalQuotaLimit != 0 { - t.Fatalf("bad: %#v", bs) - } + blockedStats := blocked.Stats() + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Equal(0, blockedStats.TotalQuotaLimit) + require.Len(blockedStats.BlockedResources.ByJob, 0) requireBlockedEvalsEnqueued(t, blocked, broker, 1) } func TestBlockedEvals_UnblockFailed(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) // Create blocked evals that are due to failures - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + e := mock.BlockedEval() e.TriggeredBy = structs.EvalTriggerMaxPlans e.EscapedComputedClass = true blocked.Block(e) - e2 := mock.Eval() + e2 := mock.BlockedEval() e2.Status = structs.EvalStatusBlocked e2.TriggeredBy = structs.EvalTriggerMaxPlans e2.ClassEligibility = map[string]bool{"v1:123": true, "v1:456": false} blocked.Block(e2) - e3 := mock.Eval() - e3.Status = structs.EvalStatusBlocked + e3 := mock.BlockedEval() e3.TriggeredBy = structs.EvalTriggerMaxPlans e3.QuotaLimitReached = "foo" blocked.Block(e3) @@ -531,98 +563,116 @@ func TestBlockedEvals_UnblockFailed(t *testing.T) { // Trigger an unblock fail blocked.UnblockFailed() + // Prune old and empty metrics. + blocked.pruneStats(time.Now().UTC()) + // Verify UnblockFailed caused the eval to be immediately unblocked - bs := blocked.Stats() - if bs.TotalBlocked != 0 || bs.TotalEscaped != 0 || bs.TotalQuotaLimit != 0 { - t.Fatalf("bad: %#v", bs) - } + blockedStats := blocked.Stats() + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Equal(0, blockedStats.TotalQuotaLimit) + require.Len(blockedStats.BlockedResources.ByJob, 0) requireBlockedEvalsEnqueued(t, blocked, broker, 3) // Reblock an eval for the same job and check that it gets tracked. 
blocked.Block(e) - bs = blocked.Stats() - if bs.TotalBlocked != 1 || bs.TotalEscaped != 1 { - t.Fatalf("bad: %#v", bs) - } + blockedStats = blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(1, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) } func TestBlockedEvals_Untrack(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) - // Create two blocked evals and add them to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + // Create blocked eval and add to the blocked tracker. + e := mock.BlockedEval() e.ClassEligibility = map[string]bool{"v1:123": false, "v1:456": false} e.SnapshotIndex = 1000 blocked.Block(e) // Verify block did track - bStats := blocked.Stats() - if bStats.TotalBlocked != 1 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) // Untrack and verify blocked.Untrack(e.JobID, e.Namespace) - bStats = blocked.Stats() - if bStats.TotalBlocked != 0 || bStats.TotalEscaped != 0 { - t.Fatalf("bad: %#v", bStats) - } + blocked.pruneStats(time.Now().UTC()) + + blockedStats = blocked.Stats() + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 0) } func TestBlockedEvals_Untrack_Quota(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) - // Create a blocked evals and add it to the blocked tracker. - e := mock.Eval() - e.Status = structs.EvalStatusBlocked + // Create a blocked eval and add it to the blocked tracker. + e := mock.BlockedEval() e.QuotaLimitReached = "foo" e.SnapshotIndex = 1000 blocked.Block(e) // Verify block did track - bs := blocked.Stats() - if bs.TotalBlocked != 1 || bs.TotalEscaped != 0 || bs.TotalQuotaLimit != 1 { - t.Fatalf("bad: %#v", bs) - } + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 1) // Untrack and verify blocked.Untrack(e.JobID, e.Namespace) - bs = blocked.Stats() - if bs.TotalBlocked != 0 || bs.TotalEscaped != 0 || bs.TotalQuotaLimit != 0 { - t.Fatalf("bad: %#v", bs) - } + blocked.pruneStats(time.Now().UTC()) + + blockedStats = blocked.Stats() + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Len(blockedStats.BlockedResources.ByJob, 0) } func TestBlockedEvals_UnblockNode(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, broker := testBlockedEvals(t) require.NotNil(t, broker) // Create a blocked evals and add it to the blocked tracker. 
- e := mock.Eval() + e := mock.BlockedEval() e.Type = structs.JobTypeSystem e.NodeID = "foo" e.SnapshotIndex = 999 blocked.Block(e) // Verify block did track - bs := blocked.Stats() - require.Equal(t, 1, bs.TotalBlocked) + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Len(blockedStats.BlockedResources.ByJob, 1) blocked.UnblockNode("foo", 1000) requireBlockedEvalsEnqueued(t, blocked, broker, 1) - bs = blocked.Stats() - require.Empty(t, blocked.system.byNode) - require.Equal(t, 0, bs.TotalBlocked) + + blocked.pruneStats(time.Now().UTC()) + blockedStats = blocked.Stats() + require.Empty(blocked.system.byNode) + require.Equal(0, blockedStats.TotalBlocked) + require.Len(blockedStats.BlockedResources.ByJob, 0) } func TestBlockedEvals_SystemUntrack(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) // Create a blocked evals and add it to the blocked tracker. @@ -632,21 +682,26 @@ func TestBlockedEvals_SystemUntrack(t *testing.T) { blocked.Block(e) // Verify block did track - bs := blocked.Stats() - require.Equal(t, 1, bs.TotalBlocked) - require.Equal(t, 0, bs.TotalEscaped) - require.Equal(t, 0, bs.TotalQuotaLimit) + blockedStats := blocked.Stats() + require.Equal(1, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Equal(0, blockedStats.TotalQuotaLimit) + require.Len(blockedStats.BlockedResources.ByJob, 1) // Untrack and verify blocked.Untrack(e.JobID, e.Namespace) - bs = blocked.Stats() - require.Equal(t, 0, bs.TotalBlocked) - require.Equal(t, 0, bs.TotalEscaped) - require.Equal(t, 0, bs.TotalQuotaLimit) + blocked.pruneStats(time.Now().UTC()) + blockedStats = blocked.Stats() + require.Equal(0, blockedStats.TotalBlocked) + require.Equal(0, blockedStats.TotalEscaped) + require.Equal(0, blockedStats.TotalQuotaLimit) + require.Len(blockedStats.BlockedResources.ByJob, 0) } func TestBlockedEvals_SystemDisableFlush(t *testing.T) { t.Parallel() + require := require.New(t) + blocked, _ := testBlockedEvals(t) // Create a blocked evals and add it to the blocked tracker. 
@@ -656,18 +711,20 @@
 	blocked.Block(e)
 
 	// Verify block did track
-	bs := blocked.Stats()
-	require.Equal(t, 1, bs.TotalBlocked)
-	require.Equal(t, 0, bs.TotalEscaped)
-	require.Equal(t, 0, bs.TotalQuotaLimit)
+	blockedStats := blocked.Stats()
+	require.Equal(1, blockedStats.TotalBlocked)
+	require.Equal(0, blockedStats.TotalEscaped)
+	require.Equal(0, blockedStats.TotalQuotaLimit)
+	require.Len(blockedStats.BlockedResources.ByJob, 1)
 
 	// Disable empties
 	blocked.SetEnabled(false)
-	bs = blocked.Stats()
-	require.Equal(t, 0, bs.TotalBlocked)
-	require.Equal(t, 0, bs.TotalEscaped)
-	require.Equal(t, 0, bs.TotalQuotaLimit)
-	require.Empty(t, blocked.system.evals)
-	require.Empty(t, blocked.system.byJob)
-	require.Empty(t, blocked.system.byNode)
+	blockedStats = blocked.Stats()
+	require.Equal(0, blockedStats.TotalBlocked)
+	require.Equal(0, blockedStats.TotalEscaped)
+	require.Equal(0, blockedStats.TotalQuotaLimit)
+	require.Len(blockedStats.BlockedResources.ByJob, 0)
+	require.Empty(blocked.system.evals)
+	require.Empty(blocked.system.byJob)
+	require.Empty(blocked.system.byNode)
 }
diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go
index 0c91bc5e06ee..6a54a60d085b 100644
--- a/nomad/mock/mock.go
+++ b/nomad/mock/mock.go
@@ -1092,6 +1092,26 @@ func Eval() *structs.Evaluation {
 	return eval
 }
 
+func BlockedEval() *structs.Evaluation {
+	e := Eval()
+	e.Status = structs.EvalStatusBlocked
+	e.FailedTGAllocs = map[string]*structs.AllocMetric{
+		"cache": {
+			DimensionExhausted: map[string]int{
+				"memory": 1,
+			},
+			ResourcesExhausted: map[string]*structs.Resources{
+				"redis": {
+					CPU:      100,
+					MemoryMB: 1024,
+				},
+			},
+		},
+	}
+
+	return e
+}
+
 func JobSummary(jobID string) *structs.JobSummary {
 	js := &structs.JobSummary{
 		JobID: jobID,
diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 637aad4ce9fa..10bcab0ac654 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -9749,6 +9749,10 @@ type AllocMetric struct {
 	// QuotaExhausted provides the exhausted dimensions
 	QuotaExhausted []string
 
+	// ResourcesExhausted provides the amount of resources exhausted by each
+	// task during allocation placement
+	ResourcesExhausted map[string]*Resources
+
 	// Scores is the scores of the final few nodes remaining
 	// for placement. The top score is typically selected.
 	// Deprecated: Replaced by ScoreMetaData in Nomad 0.9
@@ -9837,6 +9841,35 @@ func (a *AllocMetric) ExhaustQuota(dimensions []string) {
 	a.QuotaExhausted = append(a.QuotaExhausted, dimensions...)
 }
 
+// ExhaustResources updates the amount of resources exhausted for the
+// allocation because of the given task group.
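+//
+// Only dimensions already recorded in DimensionExhausted are counted: when
+// the "memory" dimension was exhausted, each task's MemoryMB is added to its
+// entry in ResourcesExhausted, and likewise for "cpu". Calling it again for
+// the same group (e.g. on a coalesced failure) accumulates on top of the
+// previous values.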
+func (a *AllocMetric) ExhaustResources(tg *TaskGroup) { + if a.DimensionExhausted == nil { + return + } + + if a.ResourcesExhausted == nil { + a.ResourcesExhausted = make(map[string]*Resources) + } + + for _, t := range tg.Tasks { + exhaustedResources := a.ResourcesExhausted[t.Name] + if exhaustedResources == nil { + exhaustedResources = &Resources{} + } + + if a.DimensionExhausted["memory"] > 0 { + exhaustedResources.MemoryMB += t.Resources.MemoryMB + } + + if a.DimensionExhausted["cpu"] > 0 { + exhaustedResources.CPU += t.Resources.CPU + } + + a.ResourcesExhausted[t.Name] = exhaustedResources + } +} + // ScoreNode is used to gather top K scoring nodes in a heap func (a *AllocMetric) ScoreNode(node *Node, name string, score float64) { // Create nodeScoreMeta lazily if its the first time or if its a new node @@ -10308,7 +10341,7 @@ func (e *Evaluation) NextRollingEval(wait time.Duration) *Evaluation { // ineligible, whether the job has escaped computed node classes and whether the // quota limit was reached. func (e *Evaluation) CreateBlockedEval(classEligibility map[string]bool, - escaped bool, quotaReached string) *Evaluation { + escaped bool, quotaReached string, failedTGAllocs map[string]*AllocMetric) *Evaluation { now := time.Now().UTC().UnixNano() return &Evaluation{ ID: uuid.Generate(), @@ -10320,6 +10353,7 @@ func (e *Evaluation) CreateBlockedEval(classEligibility map[string]bool, JobModifyIndex: e.JobModifyIndex, Status: EvalStatusBlocked, PreviousEval: e.ID, + FailedTGAllocs: failedTGAllocs, ClassEligibility: classEligibility, EscapedComputedClass: escaped, QuotaLimitReached: quotaReached, diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 852528874eb2..bbc4f5c521e0 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -200,7 +200,7 @@ func (s *GenericScheduler) createBlockedEval(planFailure bool) error { classEligibility = e.GetClasses() } - s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached()) + s.blocked = s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached(), s.failedTGAllocs) if planFailure { s.blocked.TriggeredBy = structs.EvalTriggerMaxPlans s.blocked.StatusDescription = blockedEvalMaxPlanDesc @@ -520,6 +520,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // Check if this task group has already failed if metric, ok := s.failedTGAllocs[tg.Name]; ok { metric.CoalescedFailures += 1 + metric.ExhaustResources(tg) continue } @@ -627,6 +628,9 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul s.failedTGAllocs = make(map[string]*structs.AllocMetric) } + // Update metrics with the resources requested by the task group. 
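+	// The same AllocMetric ends up in s.failedTGAllocs, which createBlockedEval
+	// passes to CreateBlockedEval, and is how the blocked eval tracker learns
+	// how much CPU and memory the blocked evaluation was asking for.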
+ s.ctx.Metrics().ExhaustResources(tg) + // Track the fact that we didn't find a placement s.failedTGAllocs[tg.Name] = s.ctx.Metrics() diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index 22684beb8af0..713546731fda 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -311,6 +311,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { // Check if this task group has already failed, reported to the user as a count if metric, ok := s.failedTGAllocs[missing.TaskGroup.Name]; ok { metric.CoalescedFailures += 1 + metric.ExhaustResources(missing.TaskGroup) continue } @@ -325,6 +326,9 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { s.failedTGAllocs = make(map[string]*structs.AllocMetric) } + // Update metrics with the resources requested by the task group. + s.ctx.Metrics().ExhaustResources(missing.TaskGroup) + // Actual failure to start this task on this candidate node, report it individually s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics() s.addBlocked(node) @@ -417,7 +421,7 @@ func (s *SystemScheduler) addBlocked(node *structs.Node) error { classEligibility = e.GetClasses() } - blocked := s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached()) + blocked := s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached(), s.failedTGAllocs) blocked.StatusDescription = blockedEvalFailedPlacements blocked.NodeID = node.ID diff --git a/vendor/github.com/hashicorp/nomad/api/allocations.go b/vendor/github.com/hashicorp/nomad/api/allocations.go index 0c1a01b1afd6..8dc837b390e2 100644 --- a/vendor/github.com/hashicorp/nomad/api/allocations.go +++ b/vendor/github.com/hashicorp/nomad/api/allocations.go @@ -374,6 +374,7 @@ type AllocationMetric struct { ClassExhausted map[string]int DimensionExhausted map[string]int QuotaExhausted []string + ResourcesExhausted map[string]*Resources // Deprecated, replaced with ScoreMetaData Scores map[string]float64 AllocationTime time.Duration From 8f5b20cc1da0c7f95718272a7ed27d5bd46c6ab9 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Wed, 28 Apr 2021 00:58:56 -0400 Subject: [PATCH 2/3] docs: add new blocked_evals metrics --- website/content/docs/operations/metrics.mdx | 422 ++++++++++---------- 1 file changed, 213 insertions(+), 209 deletions(-) diff --git a/website/content/docs/operations/metrics.mdx b/website/content/docs/operations/metrics.mdx index 9c06e5e615b4..5c0314436627 100644 --- a/website/content/docs/operations/metrics.mdx +++ b/website/content/docs/operations/metrics.mdx @@ -228,214 +228,218 @@ Job status metrics are emitted by the Nomad leader server. The following table includes metrics for overall cluster health in addition to those listed in [Key Metrics](#key-metrics) above. 
-| Metric | Description | Unit | Type | Labels |
-| ---------------------------------------------------- | ----------------------------------------------------------------- | -------------------- | ------- | ------ |
-| `nomad.memberlist.gossip` | Time elapsed to broadcast gossip messages | Nanoseconds | Summary | host |
-| `nomad.nomad.acl.bootstrap` | Time elapsed for `ACL.Bootstrap` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.delete_policies` | Time elapsed for `ACL.DeletePolicies` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.delete_tokens` | Time elapsed for `ACL.DeleteTokens` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.get_policies` | Time elapsed for `ACL.GetPolicies` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.get_policy` | Time elapsed for `ACL.GetPolicy` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.get_token` | Time elapsed for `ACL.GetToken` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.get_tokens` | Time elapsed for `ACL.GetTokens` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.list_policies` | Time elapsed for `ACL.ListPolicies` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.list_tokens` | Time elapsed for `ACL.ListTokens` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.resolve_token` | Time elapsed for `ACL.ResolveToken` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.upsert_policies` | Time elapsed for `ACL.UpsertPolicies` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.acl.upsert_tokens` | Time elapsed for `ACL.UpsertTokens` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.alloc.exec` | Time elapsed to establish alloc exec | Nanoseconds | Summary | Host |
-| `nomad.nomad.alloc.get_alloc` | Time elapsed for `Alloc.GetAlloc` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.alloc.get_allocs` | Time elapsed for `Alloc.GetAllocs` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.alloc.list` | Time elapsed for `Alloc.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.alloc.stop` | Time elapsed for `Alloc.Stop` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.alloc.update_desired_transition` | Time elapsed for `Alloc.UpdateDesiredTransition` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.blocked_evals.total_blocked` | Count of evals in the blocked state | Integer | Gauge | host |
-| `nomad.nomad.blocked_evals.total_escaped` | Count of evals that have escaped computed node classes | Integer | Gauge | host |
-| `nomad.nomad.blocked_evals.total_quota_limit` | Count of blocked evals due to quota limits | Integer | Gauge | host |
-| `nomad.nomad.broker.batch_ready` | Count of batch evals ready to be scheduled | Integer | Gauge | host |
-| `nomad.nomad.broker.batch_unacked` | Count of unacknowledged batch evals | Integer | Gauge | host |
-| `nomad.nomad.broker.service_ready` | Count of service evals ready to be scheduled | Integer | Gauge | host |
-| `nomad.nomad.broker.service_unacked` | Count of unacknowledged service evals | Integer | Gauge | host |
-| `nomad.nomad.broker.system_ready` | Count of system evals ready to be scheduled | Integer | Gauge | host |
-| `nomad.nomad.broker.system_unacked` | Count of unacknowledged system evals | Integer | Gauge | host |
-| `nomad.nomad.broker.total_ready` | Count of evals in the ready state | Integer | Gauge | host |
-| `nomad.nomad.broker.total_waiting` | Count of evals in the waiting state | Integer | Gauge | host |
-| `nomad.nomad.client.batch_deregister` | Time elapsed for `Node.BatchDeregister` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.deregister` | Time elapsed for `Node.Deregister` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.derive_si_token` | Time elapsed for `Node.DeriveSIToken` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.derive_vault_token` | Time elapsed for `Node.DeriveVaultToken` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.emit_events` | Time elapsed for `Node.EmitEvents` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.evaluate` | Time elapsed for `Node.Evaluate` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.get_allocs` | Time elapsed for `Node.GetAllocs` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.get_client_allocs` | Time elapsed for `Node.GetClientAllocs` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.get_node` | Time elapsed for `Node.GetNode` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.list` | Time elapsed for `Node.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.register` | Time elapsed for `Node.Register` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.stats` | Time elapsed for `Client.Stats` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.update_alloc` | Time elapsed for `Node.UpdateAlloc` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.update_drain` | Time elapsed for `Node.UpdateDrain` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.update_eligibility` | Time elapsed for `Node.UpdateEligibility` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client.update_status` | Time elapsed for `Node.UpdateStatus` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_allocations.garbage_collect_all` | Time elapsed for `ClientAllocations.GarbageCollectAll` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_allocations.garbage_collect` | Time elapsed for `ClientAllocations.GarbageCollect` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_allocations.restart` | Time elapsed for `ClientAllocations.Restart` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_allocations.signal` | Time elapsed for `ClientAllocations.Signal` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_allocations.stats` | Time elapsed for `ClientAllocations.Stats` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_csi_controller.attach_volume` | Time elapsed for `Controller.AttachVolume` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_csi_controller.detach_volume` | Time elapsed for `Controller.DetachVolume` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_csi_controller.validate_volume` | Time elapsed for `Controller.ValidateVolume` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.client_csi_node.detach_volume` | Time elapsed for `Node.DetachVolume` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.allocations` | Time elapsed for `Deployment.Allocations` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.cancel` | Time elapsed for `Deployment.Cancel` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.fail` | Time elapsed for `Deployment.Fail` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.get_deployment` | Time elapsed for `Deployment.GetDeployment` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.list` | Time elapsed for `Deployment.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.pause` | Time elapsed for `Deployment.Pause` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.promote` | Time elapsed for `Deployment.Promote` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.reap` | Time elapsed for `Deployment.Reap` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.run` | Time elapsed for `Deployment.Run` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.set_alloc_health` | Time elapsed for `Deployment.SetAllocHealth` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.deployment.unblock` | Time elapsed for `Deployment.Unblock` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.ack` | Time elapsed for `Eval.Ack` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.allocations` | Time elapsed for `Eval.Allocations` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.create` | Time elapsed for `Eval.Create` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.dequeue` | Time elapsed for `Eval.Dequeue` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.get_eval` | Time elapsed for `Eval.GetEval` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.list` | Time elapsed for `Eval.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.nack` | Time elapsed for `Eval.Nack` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.reap` | Time elapsed for `Eval.Reap` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.reblock` | Time elapsed for `Eval.Reblock` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.eval.update` | Time elapsed for `Eval.Update` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.file_system.list` | Time elapsed for `FileSystem.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.file_system.logs` | Time elapsed to establish `FileSystem.Logs` RPC | Nanoseconds | Summary | Host |
-| `nomad.nomad.file_system.stat` | Time elapsed for `FileSystem.Stat` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.file_system.stream` | Time elapsed to establish `FileSystem.Stream` RPC | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.alloc_client_update` | Time elapsed to apply `AllocClientUpdate` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.alloc_update_desired_transition` | Time elapsed to apply `AllocUpdateDesiredTransition` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.alloc_update` | Time elapsed to apply `AllocUpdate` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_acl_policy_delete` | Time elapsed to apply `ApplyACLPolicyDelete` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_acl_policy_upsert` | Time elapsed to apply `ApplyACLPolicyUpsert` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_acl_token_bootstrap` | Time elapsed to apply `ApplyACLTokenBootstrap` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_acl_token_delete` | Time elapsed to apply `ApplyACLTokenDelete` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_acl_token_upsert` | Time elapsed to apply `ApplyACLTokenUpsert` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_csi_plugin_delete` | Time elapsed to apply `ApplyCSIPluginDelete` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_csi_volume_batch_claim` | Time elapsed to apply `ApplyCSIVolumeBatchClaim` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_csi_volume_claim` | Time elapsed to apply `ApplyCSIVolumeClaim` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_csi_volume_deregister` | Time elapsed to apply `ApplyCSIVolumeDeregister` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_csi_volume_register` | Time elapsed to apply `ApplyCSIVolumeRegister` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_deployment_alloc_health` | Time elapsed to apply `ApplyDeploymentAllocHealth` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_deployment_delete` | Time elapsed to apply `ApplyDeploymentDelete` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_deployment_promotion` | Time elapsed to apply `ApplyDeploymentPromotion` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_deployment_status_update` | Time elapsed to apply `ApplyDeploymentStatusUpdate` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_job_stability` | Time elapsed to apply `ApplyJobStability` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_namespace_delete` | Time elapsed to apply `ApplyNamespaceDelete` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_namespace_upsert` | Time elapsed to apply `ApplyNamespaceUpsert` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_plan_results` | Time elapsed to apply `ApplyPlanResults` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.apply_scheduler_config` | Time elapsed to apply `ApplySchedulerConfig` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.autopilot` | Time elapsed to apply `Autopilot` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.batch_deregister_job` | Time elapsed to apply `BatchDeregisterJob` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.batch_deregister_node` | Time elapsed to apply `BatchDeregisterNode` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.batch_node_drain_update` | Time elapsed to apply `BatchNodeDrainUpdate` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.cluster_meta` | Time elapsed to apply `ClusterMeta` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.delete_eval` | Time elapsed to apply `DeleteEval` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.deregister_job` | Time elapsed to apply `DeregisterJob` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.deregister_node` | Time elapsed to apply `DeregisterNode` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.deregister_si_accessor` | Time elapsed to apply `DeregisterSITokenAccessor` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.deregister_vault_accessor` | Time elapsed to apply `DeregisterVaultAccessor` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.node_drain_update` | Time elapsed to apply `NodeDrainUpdate` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.node_eligibility_update` | Time elapsed to apply `NodeEligibilityUpdate` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.node_status_update` | Time elapsed to apply `NodeStatusUpdate` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.persist` | Time elapsed to apply `Persist` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.register_job` | Time elapsed to apply `RegisterJob` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.register_node` | Time elapsed to apply `RegisterNode` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.update_eval` | Time elapsed to apply `UpdateEval` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.upsert_node_events` | Time elapsed to apply `UpsertNodeEvents` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.upsert_scaling_event` | Time elapsed to apply `UpsertScalingEvent` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.upsert_si_accessor` | Time elapsed to apply `UpsertSITokenAccessors` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.fsm.upsert_vault_accessor` | Time elapsed to apply `UpsertVaultAccessor` raft entry | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.allocations` | Time elapsed for `Job.Allocations` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.batch_deregister` | Time elapsed for `Job.BatchDeregister` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.deployments` | Time elapsed for `Job.Deployments` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.deregister` | Time elapsed for `Job.Deregister` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.dispatch` | Time elapsed for `Job.Dispatch` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.evaluate` | Time elapsed for `Job.Evaluate` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.evaluations` | Time elapsed for `Job.Evaluations` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.get_job_versions` | Time elapsed for `Job.GetJobVersions` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.get_job` | Time elapsed for `Job.GetJob` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.latest_deployment` | Time elapsed for `Job.LatestDeployment` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.list` | Time elapsed for `Job.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.plan` | Time elapsed for `Job.Plan` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.register` | Time elapsed for `Job.Register` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.revert` | Time elapsed for `Job.Revert` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.scale_status` | Time elapsed for `Job.ScaleStatus` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.scale` | Time elapsed for `Job.Scale` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.stable` | Time elapsed for `Job.Stable` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job.validate` | Time elapsed for `Job.Validate` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.job_summary.get_job_summary` | Time elapsed for `Job.Summary` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.leader.barrier` | Time elapsed to establish a raft barrier during leader transition | Nanoseconds | Summary | host |
-| `nomad.nomad.leader.reconcileMember` | Time elapsed to reconcile a serf peer with state store | Nanoseconds | Summary | host |
-| `nomad.nomad.leader.reconcile` | Time elapsed to reconcile all serf peers with state store | Nanoseconds | Summary | host |
-| `nomad.nomad.namespace.delete_namespaces` | Time elapsed for `Namespace.DeleteNamespaces` | Nanoseconds | Summary | Host |
-| `nomad.nomad.namespace.get_namespace` | Time elapsed for `Namespace.GetNamespace` | Nanoseconds | Summary | Host |
-| `nomad.nomad.namespace.get_namespaces` | Time elapsed for `Namespace.GetNamespaces` | Nanoseconds | Summary | Host |
-| `nomad.nomad.namespace.list_namespace` | Time elapsed for `Namespace.ListNamespaces` | Nanoseconds | Summary | Host |
-| `nomad.nomad.namespace.upsert_namespaces` | Time elapsed for `Namespace.UpsertNamespaces` | Nanoseconds | Summary | Host |
-| `nomad.nomad.periodic.force` | Time elapsed for `Periodic.Force` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.plan.apply` | Time elapsed to apply a plan | Nanoseconds | Summary | host |
-| `nomad.nomad.plan.evaluate` | Time elapsed to evaluate a plan | Nanoseconds | Summary | host |
-| `nomad.nomad.plan.queue_depth` | Count of evals in the plan queue | Integer | Gauge | host |
-| `nomad.nomad.plan.submit` | Time elapsed for `Plan.Submit` RPC call | Nanoseconds | Summary | host |
-| `nomad.nomad.plan.wait_for_index` | Time elapsed for the planner to obtain a snapshot | Nanoseconds | Summary | host |
-| `nomad.nomad.plugin.delete` | Time elapsed for `CSIPlugin.Delete` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.plugin.get` | Time elapsed for `CSIPlugin.Get` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.plugin.list` | Time elapsed for `CSIPlugin.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.scaling.get_policy` | Time elapsed for `Scaling.GetPolicy` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.scaling.list_policies` | Time elapsed for `Scaling.ListPolicies` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.search.prefix_search` | Time elapsed for `Search.PrefixSearch` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.vault.create_token` | Time elapsed to create Vault token | Nanoseconds | Gauge | host |
-| `nomad.nomad.vault.distributed_tokens_revoked` | Count of revoked tokens | Integer | Gauge | host |
-| `nomad.nomad.vault.lookup_token` | Time elapsed to lookup Vault token | Nanoseconds | Gauge | host |
-| `nomad.nomad.vault.renew_failed` | Count of failed attempts to renew Vault token | Integer | Gauge | host |
-| `nomad.nomad.vault.renew` | Time elapsed to renew Vault token | Nanoseconds | Gauge | host |
-| `nomad.nomad.vault.revoke_tokens` | Time elapsed to revoke Vault tokens | Nanoseconds | Gauge | host |
-| `nomad.nomad.vault.token_ttl` | Time to live for Vault token | Integer | Gauge | host |
-| `nomad.nomad.vault.undistributed_tokens_abandoned` | Count of abandoned tokens | Integer | Gauge | host |
-| `nomad.nomad.volume.claim` | Time elapsed for `CSIVolume.Claim` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.volume.deregister` | Time elapsed for `CSIVolume.Deregister` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.volume.get` | Time elapsed for `CSIVolume.Get` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.volume.list` | Time elapsed for `CSIVolume.List` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.volume.register` | Time elapsed for `CSIVolume.Register` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.volume.unpublish` | Time elapsed for `CSIVolume.Unpublish` RPC call | Nanoseconds | Summary | Host |
-| `nomad.nomad.worker.create_eval` | Time elapsed for worker to create an eval | Nanoseconds | Summary | host |
-| `nomad.nomad.worker.dequeue_eval` | Time elapsed for worker to dequeue an eval | Nanoseconds | Summary | host |
-| `nomad.nomad.worker.invoke_scheduler_service` | Time elapsed for worker to invoke the scheduler | Nanoseconds | Summary | host |
-| `nomad.nomad.worker.send_ack` | Time elapsed for worker to send acknowledgement | Nanoseconds | Summary | host |
-| `nomad.nomad.worker.submit_plan` | Time elapsed for worker to submit plan | Nanoseconds | Summary | host |
-| `nomad.nomad.worker.update_eval` | Time elapsed for worker to submit updated eval | Nanoseconds | Summary | host |
-| `nomad.nomad.worker.wait_for_index` | Time elapsed for worker get snapshot | Nanoseconds | Summary | host |
-| `nomad.raft.appliedIndex` | Current index applied to FSM | Integer | Gauge | host |
-| `nomad.raft.barrier` | Count of blocking raft API calls | Integer | Counter | host |
-| `nomad.raft.commitNumLogs` | Count of logs enqueued | Integer | Gauge | host |
-| `nomad.raft.commitTime` | Time elapsed to commit writes | Nanoseconds | Summary | host |
-| `nomad.raft.fsm.apply` | Time elapsed to apply write to FSM | Nanoseconds | Summary | host |
-| `nomad.raft.fsm.enqueue` | Time elapsed to enqueue write to FSM | Nanoseconds | Summary | host |
-| `nomad.raft.lastIndex` | Most recent index seen | Integer | Gauge | host |
-| `nomad.raft.leader.dispatchLog` | Time elapsed to write log, mark in flight, and start replication | Nanoseconds | Summary | host |
-| `nomad.raft.leader.dispatchNumLogs` | Count of logs dispatched | Integer | Gauge | host |
-| `nomad.raft.replication.appendEntries` | Raft transaction commit time | ms / Raft Log Append | Timer | |
-| `nomad.raft.state.candidate` | Count of entering candidate state | Integer | Gauge | host |
-| `nomad.raft.state.follower` | Count of entering follower state | Integer | Gauge | host |
-| `nomad.raft.state.leader` | Count of entering leader state | Integer | Gauge | host |
-| `nomad.raft.transition.heartbeat_timeout` | Count of failing to heartbeat and starting election | Integer | Gauge | host |
-| `nomad.raft.transition.leader_lease_timeout` | Count of stepping down as leader after losing quorum | Integer | Gauge | host |
-| `nomad.runtime.free_count` | Count of objects freed from heap by go runtime GC | Integer | Gauge | host |
-| `nomad.runtime.gc_pause_ns` | Go runtime GC pause times | Nanoseconds | Summary | host |
-| `nomad.runtime.sys_bytes` | Go runtime GC metadata size | # of bytes | Gauge | host |
-| `nomad.runtime.total_gc_pause_ns` | Total elapsed go runtime GC pause times | Nanoseconds | Gauge | host |
-| `nomad.runtime.total_gc_runs` | Count of go runtime GC runs | Integer | Gauge | host |
-| `nomad.serf.queue.Event` | Count of memberlist events received | Integer | Summary | host |
-| `nomad.serf.queue.Intent` | Count of memberlist changes | Integer | Summary | host |
-| `nomad.serf.queue.Query` | Count of memberlist queries | Integer | Summary | host |
-| `nomad.state.snapshotIndex` | Current snapshot index | Integer | Gauge | host |
+| Metric | Description | Unit | Type | Labels |
+| ---------------------------------------------------- | ----------------------------------------------------------------- | -------------------- | ------- | ---------------------------- |
+| `nomad.memberlist.gossip` | Time elapsed to broadcast gossip messages | Nanoseconds | Summary | host |
+| `nomad.nomad.acl.bootstrap` | Time elapsed for `ACL.Bootstrap` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.delete_policies` | Time elapsed for `ACL.DeletePolicies` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.delete_tokens` | Time elapsed for `ACL.DeleteTokens` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.get_policies` | Time elapsed for `ACL.GetPolicies` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.get_policy` | Time elapsed for `ACL.GetPolicy` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.get_token` | Time elapsed for `ACL.GetToken` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.get_tokens` | Time elapsed for `ACL.GetTokens` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.list_policies` | Time elapsed for `ACL.ListPolicies` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.list_tokens` | Time elapsed for `ACL.ListTokens` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.resolve_token` | Time elapsed for `ACL.ResolveToken` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.upsert_policies` | Time elapsed for `ACL.UpsertPolicies` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.acl.upsert_tokens` | Time elapsed for `ACL.UpsertTokens` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.alloc.exec` | Time elapsed to establish alloc exec | Nanoseconds | Summary | Host |
+| `nomad.nomad.alloc.get_alloc` | Time elapsed for `Alloc.GetAlloc` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.alloc.get_allocs` | Time elapsed for `Alloc.GetAllocs` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.alloc.list` | Time elapsed for `Alloc.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.alloc.stop` | Time elapsed for `Alloc.Stop` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.alloc.update_desired_transition` | Time elapsed for `Alloc.UpdateDesiredTransition` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.blocked_evals.cpu` | Amount of CPU shares requested by blocked evals | Integer | Gauge | datacenter, host, node_class |
+| `nomad.nomad.blocked_evals.memory` | Amount of memory requested by blocked evals | Integer | Gauge | datacenter, host, node_class |
+| `nomad.nomad.blocked_evals.job.cpu` | Amount of CPU shares requested by blocked evals of a job | Integer | Gauge | host, job, namespace |
+| `nomad.nomad.blocked_evals.job.memory` | Amount of memory requested by blocked evals of a job | Integer | Gauge | host, job, namespace |
+| `nomad.nomad.blocked_evals.total_blocked` | Count of evals in the blocked state | Integer | Gauge | host |
+| `nomad.nomad.blocked_evals.total_escaped` | Count of evals that have escaped computed node classes | Integer | Gauge | host |
+| `nomad.nomad.blocked_evals.total_quota_limit` | Count of blocked evals due to quota limits | Integer | Gauge | host |
+| `nomad.nomad.broker.batch_ready` | Count of batch evals ready to be scheduled | Integer | Gauge | host |
+| `nomad.nomad.broker.batch_unacked` | Count of unacknowledged batch evals | Integer | Gauge | host |
+| `nomad.nomad.broker.service_ready` | Count of service evals ready to be scheduled | Integer | Gauge | host |
+| `nomad.nomad.broker.service_unacked` | Count of unacknowledged service evals | Integer | Gauge | host |
+| `nomad.nomad.broker.system_ready` | Count of system evals ready to be scheduled | Integer | Gauge | host |
+| `nomad.nomad.broker.system_unacked` | Count of unacknowledged system evals | Integer | Gauge | host |
+| `nomad.nomad.broker.total_ready` | Count of evals in the ready state | Integer | Gauge | host |
+| `nomad.nomad.broker.total_waiting` | Count of evals in the waiting state | Integer | Gauge | host |
+| `nomad.nomad.client.batch_deregister` | Time elapsed for `Node.BatchDeregister` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.deregister` | Time elapsed for `Node.Deregister` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.derive_si_token` | Time elapsed for `Node.DeriveSIToken` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.derive_vault_token` | Time elapsed for `Node.DeriveVaultToken` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.emit_events` | Time elapsed for `Node.EmitEvents` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.evaluate` | Time elapsed for `Node.Evaluate` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.get_allocs` | Time elapsed for `Node.GetAllocs` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.get_client_allocs` | Time elapsed for `Node.GetClientAllocs` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.get_node` | Time elapsed for `Node.GetNode` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.list` | Time elapsed for `Node.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.register` | Time elapsed for `Node.Register` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.stats` | Time elapsed for `Client.Stats` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.update_alloc` | Time elapsed for `Node.UpdateAlloc` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.update_drain` | Time elapsed for `Node.UpdateDrain` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.update_eligibility` | Time elapsed for `Node.UpdateEligibility` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client.update_status` | Time elapsed for `Node.UpdateStatus` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_allocations.garbage_collect_all` | Time elapsed for `ClientAllocations.GarbageCollectAll` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_allocations.garbage_collect` | Time elapsed for `ClientAllocations.GarbageCollect` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_allocations.restart` | Time elapsed for `ClientAllocations.Restart` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_allocations.signal` | Time elapsed for `ClientAllocations.Signal` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_allocations.stats` | Time elapsed for `ClientAllocations.Stats` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_csi_controller.attach_volume` | Time elapsed for `Controller.AttachVolume` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_csi_controller.detach_volume` | Time elapsed for `Controller.DetachVolume` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_csi_controller.validate_volume` | Time elapsed for `Controller.ValidateVolume` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.client_csi_node.detach_volume` | Time elapsed for `Node.DetachVolume` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.allocations` | Time elapsed for `Deployment.Allocations` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.cancel` | Time elapsed for `Deployment.Cancel` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.fail` | Time elapsed for `Deployment.Fail` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.get_deployment` | Time elapsed for `Deployment.GetDeployment` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.list` | Time elapsed for `Deployment.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.pause` | Time elapsed for `Deployment.Pause` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.promote` | Time elapsed for `Deployment.Promote` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.reap` | Time elapsed for `Deployment.Reap` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.run` | Time elapsed for `Deployment.Run` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.set_alloc_health` | Time elapsed for `Deployment.SetAllocHealth` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.deployment.unblock` | Time elapsed for `Deployment.Unblock` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.ack` | Time elapsed for `Eval.Ack` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.allocations` | Time elapsed for `Eval.Allocations` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.create` | Time elapsed for `Eval.Create` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.dequeue` | Time elapsed for `Eval.Dequeue` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.get_eval` | Time elapsed for `Eval.GetEval` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.list` | Time elapsed for `Eval.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.nack` | Time elapsed for `Eval.Nack` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.reap` | Time elapsed for `Eval.Reap` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.reblock` | Time elapsed for `Eval.Reblock` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.eval.update` | Time elapsed for `Eval.Update` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.file_system.list` | Time elapsed for `FileSystem.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.file_system.logs` | Time elapsed to establish `FileSystem.Logs` RPC | Nanoseconds | Summary | Host |
+| `nomad.nomad.file_system.stat` | Time elapsed for `FileSystem.Stat` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.file_system.stream` | Time elapsed to establish `FileSystem.Stream` RPC | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.alloc_client_update` | Time elapsed to apply `AllocClientUpdate` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.alloc_update_desired_transition` | Time elapsed to apply `AllocUpdateDesiredTransition` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.alloc_update` | Time elapsed to apply `AllocUpdate` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_acl_policy_delete` | Time elapsed to apply `ApplyACLPolicyDelete` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_acl_policy_upsert` | Time elapsed to apply `ApplyACLPolicyUpsert` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_acl_token_bootstrap` | Time elapsed to apply `ApplyACLTokenBootstrap` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_acl_token_delete` | Time elapsed to apply `ApplyACLTokenDelete` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_acl_token_upsert` | Time elapsed to apply `ApplyACLTokenUpsert` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_csi_plugin_delete` | Time elapsed to apply `ApplyCSIPluginDelete` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_csi_volume_batch_claim` | Time elapsed to apply `ApplyCSIVolumeBatchClaim` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_csi_volume_claim` | Time elapsed to apply `ApplyCSIVolumeClaim` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_csi_volume_deregister` | Time elapsed to apply `ApplyCSIVolumeDeregister` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_csi_volume_register` | Time elapsed to apply `ApplyCSIVolumeRegister` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_deployment_alloc_health` | Time elapsed to apply `ApplyDeploymentAllocHealth` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_deployment_delete` | Time elapsed to apply `ApplyDeploymentDelete` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_deployment_promotion` | Time elapsed to apply `ApplyDeploymentPromotion` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_deployment_status_update` | Time elapsed to apply `ApplyDeploymentStatusUpdate` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_job_stability` | Time elapsed to apply `ApplyJobStability` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_namespace_delete` | Time elapsed to apply `ApplyNamespaceDelete` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_namespace_upsert` | Time elapsed to apply `ApplyNamespaceUpsert` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_plan_results` | Time elapsed to apply `ApplyPlanResults` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.apply_scheduler_config` | Time elapsed to apply `ApplySchedulerConfig` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.autopilot` | Time elapsed to apply `Autopilot` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.batch_deregister_job` | Time elapsed to apply `BatchDeregisterJob` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.batch_deregister_node` | Time elapsed to apply `BatchDeregisterNode` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.batch_node_drain_update` | Time elapsed to apply `BatchNodeDrainUpdate` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.cluster_meta` | Time elapsed to apply `ClusterMeta` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.delete_eval` | Time elapsed to apply `DeleteEval` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.deregister_job` | Time elapsed to apply `DeregisterJob` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.deregister_node` | Time elapsed to apply `DeregisterNode` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.deregister_si_accessor` | Time elapsed to apply `DeregisterSITokenAccessor` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.deregister_vault_accessor` | Time elapsed to apply `DeregisterVaultAccessor` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.node_drain_update` | Time elapsed to apply `NodeDrainUpdate` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.node_eligibility_update` | Time elapsed to apply `NodeEligibilityUpdate` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.node_status_update` | Time elapsed to apply `NodeStatusUpdate` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.persist` | Time elapsed to apply `Persist` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.register_job` | Time elapsed to apply `RegisterJob` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.register_node` | Time elapsed to apply `RegisterNode` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.update_eval` | Time elapsed to apply `UpdateEval` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.upsert_node_events` | Time elapsed to apply `UpsertNodeEvents` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.upsert_scaling_event` | Time elapsed to apply `UpsertScalingEvent` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.upsert_si_accessor` | Time elapsed to apply `UpsertSITokenAccessors` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.fsm.upsert_vault_accessor` | Time elapsed to apply `UpsertVaultAccessor` raft entry | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.allocations` | Time elapsed for `Job.Allocations` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.batch_deregister` | Time elapsed for `Job.BatchDeregister` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.deployments` | Time elapsed for `Job.Deployments` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.deregister` | Time elapsed for `Job.Deregister` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.dispatch` | Time elapsed for `Job.Dispatch` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.evaluate` | Time elapsed for `Job.Evaluate` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.evaluations` | Time elapsed for `Job.Evaluations` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.get_job_versions` | Time elapsed for `Job.GetJobVersions` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.get_job` | Time elapsed for `Job.GetJob` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.latest_deployment` | Time elapsed for `Job.LatestDeployment` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.list` | Time elapsed for `Job.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.plan` | Time elapsed for `Job.Plan` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.register` | Time elapsed for `Job.Register` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.revert` | Time elapsed for `Job.Revert` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.scale_status` | Time elapsed for `Job.ScaleStatus` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.scale` | Time elapsed for `Job.Scale` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.stable` | Time elapsed for `Job.Stable` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job.validate` | Time elapsed for `Job.Validate` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.job_summary.get_job_summary` | Time elapsed for `Job.Summary` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.leader.barrier` | Time elapsed to establish a raft barrier during leader transition | Nanoseconds | Summary | host |
+| `nomad.nomad.leader.reconcileMember` | Time elapsed to reconcile a serf peer with state store | Nanoseconds | Summary | host |
+| `nomad.nomad.leader.reconcile` | Time elapsed to reconcile all serf peers with state store | Nanoseconds | Summary | host |
+| `nomad.nomad.namespace.delete_namespaces` | Time elapsed for `Namespace.DeleteNamespaces` | Nanoseconds | Summary | Host |
+| `nomad.nomad.namespace.get_namespace` | Time elapsed for `Namespace.GetNamespace` | Nanoseconds | Summary | Host |
+| `nomad.nomad.namespace.get_namespaces` | Time elapsed for `Namespace.GetNamespaces` | Nanoseconds | Summary | Host |
+| `nomad.nomad.namespace.list_namespace` | Time elapsed for `Namespace.ListNamespaces` | Nanoseconds | Summary | Host |
+| `nomad.nomad.namespace.upsert_namespaces` | Time elapsed for `Namespace.UpsertNamespaces` | Nanoseconds | Summary | Host |
+| `nomad.nomad.periodic.force` | Time elapsed for `Periodic.Force` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.plan.apply` | Time elapsed to apply a plan | Nanoseconds | Summary | host |
+| `nomad.nomad.plan.evaluate` | Time elapsed to evaluate a plan | Nanoseconds | Summary | host |
+| `nomad.nomad.plan.queue_depth` | Count of evals in the plan queue | Integer | Gauge | host |
+| `nomad.nomad.plan.submit` | Time elapsed for `Plan.Submit` RPC call | Nanoseconds | Summary | host |
+| `nomad.nomad.plan.wait_for_index` | Time elapsed for the planner to obtain a snapshot | Nanoseconds | Summary | host |
+| `nomad.nomad.plugin.delete` | Time elapsed for `CSIPlugin.Delete` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.plugin.get` | Time elapsed for `CSIPlugin.Get` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.plugin.list` | Time elapsed for `CSIPlugin.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.scaling.get_policy` | Time elapsed for `Scaling.GetPolicy` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.scaling.list_policies` | Time elapsed for `Scaling.ListPolicies` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.search.prefix_search` | Time elapsed for `Search.PrefixSearch` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.vault.create_token` | Time elapsed to create Vault token | Nanoseconds | Gauge | host |
+| `nomad.nomad.vault.distributed_tokens_revoked` | Count of revoked tokens | Integer | Gauge | host |
+| `nomad.nomad.vault.lookup_token` | Time elapsed to lookup Vault token | Nanoseconds | Gauge | host |
+| `nomad.nomad.vault.renew_failed` | Count of failed attempts to renew Vault token | Integer | Gauge | host |
+| `nomad.nomad.vault.renew` | Time elapsed to renew Vault token | Nanoseconds | Gauge | host |
+| `nomad.nomad.vault.revoke_tokens` | Time elapsed to revoke Vault tokens | Nanoseconds | Gauge | host |
+| `nomad.nomad.vault.token_ttl` | Time to live for Vault token | Integer | Gauge | host |
+| `nomad.nomad.vault.undistributed_tokens_abandoned` | Count of abandoned tokens | Integer | Gauge | host |
+| `nomad.nomad.volume.claim` | Time elapsed for `CSIVolume.Claim` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.volume.deregister` | Time elapsed for `CSIVolume.Deregister` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.volume.get` | Time elapsed for `CSIVolume.Get` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.volume.list` | Time elapsed for `CSIVolume.List` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.volume.register` | Time elapsed for `CSIVolume.Register` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.volume.unpublish` | Time elapsed for `CSIVolume.Unpublish` RPC call | Nanoseconds | Summary | Host |
+| `nomad.nomad.worker.create_eval` | Time elapsed for worker to create an eval | Nanoseconds | Summary | host |
+| `nomad.nomad.worker.dequeue_eval` | Time elapsed for worker to dequeue an eval | Nanoseconds | Summary | host |
+| `nomad.nomad.worker.invoke_scheduler_service` | Time elapsed for worker to invoke the scheduler | Nanoseconds | Summary | host |
+| `nomad.nomad.worker.send_ack` | Time elapsed for worker to send acknowledgement | Nanoseconds | Summary | host |
+| `nomad.nomad.worker.submit_plan` | Time elapsed for worker to submit plan | Nanoseconds | Summary | host |
+| `nomad.nomad.worker.update_eval` | Time elapsed for worker to submit updated eval | Nanoseconds | Summary | host |
+| `nomad.nomad.worker.wait_for_index` | Time elapsed for worker get snapshot | Nanoseconds | Summary | host |
+| `nomad.raft.appliedIndex` | Current index applied to FSM | Integer | Gauge | host |
+| `nomad.raft.barrier` | Count of blocking raft API calls | Integer | Counter | host |
+| `nomad.raft.commitNumLogs` | Count of logs enqueued | Integer | Gauge | host |
+| `nomad.raft.commitTime` | Time elapsed to commit writes | Nanoseconds | Summary | host |
+| `nomad.raft.fsm.apply` | Time elapsed to apply write to FSM | Nanoseconds | Summary | host |
+| `nomad.raft.fsm.enqueue` | Time elapsed to enqueue write to FSM | Nanoseconds | Summary | host |
+| `nomad.raft.lastIndex` | Most recent index seen | Integer | Gauge | host |
+| `nomad.raft.leader.dispatchLog` | Time elapsed to write log, mark in flight, and start replication | Nanoseconds | Summary | host |
+| `nomad.raft.leader.dispatchNumLogs` | Count of logs dispatched | Integer | Gauge | host |
+| `nomad.raft.replication.appendEntries` | Raft transaction commit time | ms / Raft Log Append | Timer | |
+| `nomad.raft.state.candidate` | Count of entering candidate state | Integer | Gauge | host |
+| `nomad.raft.state.follower` | Count of entering follower state | Integer | Gauge | host |
+| `nomad.raft.state.leader` | Count of entering leader state | Integer | Gauge | host |
+| `nomad.raft.transition.heartbeat_timeout` | Count of failing to heartbeat and starting election | Integer | Gauge | host |
+| `nomad.raft.transition.leader_lease_timeout` | Count of stepping down as leader after losing quorum | Integer | Gauge | host |
+| `nomad.runtime.free_count` | Count of objects freed from heap by go runtime GC | Integer | Gauge | host |
+| `nomad.runtime.gc_pause_ns` | Go runtime GC pause times | Nanoseconds | Summary | host |
+| `nomad.runtime.sys_bytes` | Go runtime GC metadata size | # of bytes | Gauge | host |
+| `nomad.runtime.total_gc_pause_ns` | Total elapsed go runtime GC pause times | Nanoseconds | Gauge | host |
+| `nomad.runtime.total_gc_runs` | Count of go runtime GC runs | Integer | Gauge | host |
+| `nomad.serf.queue.Event` | Count of memberlist events received | Integer | Summary | host |
+| `nomad.serf.queue.Intent` | Count of memberlist changes | Integer | Summary | host |
+| `nomad.serf.queue.Query` | Count of memberlist queries | Integer | Summary | host |
+| `nomad.state.snapshotIndex` | Current snapshot index | Integer | Gauge | host |

 [tagged-metrics]: /docs/telemetry/metrics#tagged-metrics

From 9e36d13e79cb7fc3f2d518ae7665aff540187140 Mon Sep 17 00:00:00 2001
From: Luiz Aoqui
Date: Wed, 28 Apr 2021 17:18:51 -0400
Subject: [PATCH 3/3] fix to call `pruneStats` instead of `stats.prune` directly

---
 nomad/blocked_evals.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nomad/blocked_evals.go b/nomad/blocked_evals.go
index 4359bf7e5a0b..edcb8e08d23a 100644
--- a/nomad/blocked_evals.go
+++ b/nomad/blocked_evals.go
@@ -749,7 +749,7 @@ func (b *BlockedEvals) prune(stopCh <-chan struct{}) {
 		case t := <-ticker.C:
 			cutoff := t.UTC().Add(-1 * pruneThreshold)
 			b.pruneUnblockIndexes(cutoff)
-			b.stats.prune(cutoff)
+			b.pruneStats(cutoff)
 		}
 	}
 }