Skip to content

Commit

Permalink
Merge pull request #13786 from hashicorp/b-metrics-for-classless-bloc…
Browse files Browse the repository at this point in the history
…ked-evals

metrics: classless blocked evals get metrics
  • Loading branch information
shoenig committed Jul 18, 2022
2 parents cd047cd + 582a8a9 commit 8e25502
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 47 deletions.
3 changes: 3 additions & 0 deletions .changelog/13786.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
metrics: Fixed a bug where blocked evals with no class produced no dc:class scope metrics
```
10 changes: 5 additions & 5 deletions nomad/blocked_evals.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import (
"sync"
"time"

metrics "github.com/armon/go-metrics"
log "github.com/hashicorp/go-hclog"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
Expand All @@ -31,7 +31,7 @@ const (
// failed allocation becomes available.
type BlockedEvals struct {
// logger is the logger to use by the blocked eval tracker.
logger log.Logger
logger hclog.Logger

evalBroker *EvalBroker
enabled bool
Expand Down Expand Up @@ -96,7 +96,7 @@ type wrappedEval struct {

// NewBlockedEvals creates a new blocked eval tracker that will enqueue
// unblocked evals into the passed broker.
func NewBlockedEvals(evalBroker *EvalBroker, logger log.Logger) *BlockedEvals {
func NewBlockedEvals(evalBroker *EvalBroker, logger hclog.Logger) *BlockedEvals {
return &BlockedEvals{
logger: logger.Named("blocked_evals"),
evalBroker: evalBroker,
Expand Down Expand Up @@ -727,7 +727,7 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels)
}

for k, v := range stats.BlockedResources.ByNode {
for k, v := range stats.BlockedResources.ByClassInDC {
labels := []metrics.Label{
{Name: "datacenter", Value: k.dc},
{Name: "node_class", Value: k.class},
Expand Down
42 changes: 23 additions & 19 deletions nomad/blocked_evals_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ type BlockedStats struct {
BlockedResources *BlockedResourcesStats
}

// node stores information related to nodes.
type node struct {
// classInDC is a coordinate of a specific class in a specific datacenter
type classInDC struct {
dc string
class string
}
Expand Down Expand Up @@ -65,9 +65,9 @@ func (b *BlockedStats) prune(cutoff time.Time) {
}
}

for k, v := range b.BlockedResources.ByNode {
for k, v := range b.BlockedResources.ByClassInDC {
if shouldPrune(v) {
delete(b.BlockedResources.ByNode, k)
delete(b.BlockedResources.ByClassInDC, k)
}
}
}
Expand All @@ -89,6 +89,10 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
for class := range allocMetrics.ClassExhausted {
classes[class] = struct{}{}
}
if len(allocMetrics.ClassExhausted) == 0 {
// some evaluations have no class
classes[""] = struct{}{}
}
for _, r := range allocMetrics.ResourcesExhausted {
resources.CPU += r.CPU
resources.MemoryMB += r.MemoryMB
Expand All @@ -99,32 +103,32 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
nsID := structs.NewNamespacedID(eval.JobID, eval.Namespace)
byJob[nsID] = resources

byNodeInfo := make(map[node]BlockedResourcesSummary)
byClassInDC := make(map[classInDC]BlockedResourcesSummary)
for dc := range dcs {
for class := range classes {
k := node{dc: dc, class: class}
byNodeInfo[k] = resources
k := classInDC{dc: dc, class: class}
byClassInDC[k] = resources
}
}

return &BlockedResourcesStats{
ByJob: byJob,
ByNode: byNodeInfo,
ByJob: byJob,
ByClassInDC: byClassInDC,
}
}

// BlockedResourcesStats stores resources requested by blocked evaluations,
// tracked both by job and by node.
type BlockedResourcesStats struct {
ByJob map[structs.NamespacedID]BlockedResourcesSummary
ByNode map[node]BlockedResourcesSummary
ByJob map[structs.NamespacedID]BlockedResourcesSummary
ByClassInDC map[classInDC]BlockedResourcesSummary
}

// NewBlockedResourcesStats returns a new BlockedResourcesStats.
func NewBlockedResourcesStats() *BlockedResourcesStats {
return &BlockedResourcesStats{
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
ByNode: make(map[node]BlockedResourcesSummary),
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
ByClassInDC: make(map[classInDC]BlockedResourcesSummary),
}
}

Expand All @@ -136,8 +140,8 @@ func (b *BlockedResourcesStats) Copy() *BlockedResourcesStats {
result.ByJob[k] = v // value copy
}

for k, v := range b.ByNode {
result.ByNode[k] = v // value copy
for k, v := range b.ByClassInDC {
result.ByClassInDC[k] = v // value copy
}

return result
Expand All @@ -152,8 +156,8 @@ func (b *BlockedResourcesStats) Add(a *BlockedResourcesStats) *BlockedResourcesS
result.ByJob[k] = b.ByJob[k].Add(v)
}

for k, v := range a.ByNode {
result.ByNode[k] = b.ByNode[k].Add(v)
for k, v := range a.ByClassInDC {
result.ByClassInDC[k] = b.ByClassInDC[k].Add(v)
}

return result
Expand All @@ -168,8 +172,8 @@ func (b *BlockedResourcesStats) Subtract(a *BlockedResourcesStats) *BlockedResou
result.ByJob[k] = b.ByJob[k].Subtract(v)
}

for k, v := range a.ByNode {
result.ByNode[k] = b.ByNode[k].Subtract(v)
for k, v := range a.ByClassInDC {
result.ByClassInDC[k] = b.ByClassInDC[k].Subtract(v)
}

return result
Expand Down
62 changes: 39 additions & 23 deletions nomad/blocked_evals_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ func TestBlockedResourceStats_New(t *testing.T) {
a := NewBlockedResourcesStats()
require.NotNil(t, a.ByJob)
require.Empty(t, a.ByJob)
require.NotNil(t, a.ByNode)
require.Empty(t, a.ByNode)
require.NotNil(t, a.ByClassInDC)
require.Empty(t, a.ByClassInDC)
}

var (
Expand All @@ -143,15 +143,20 @@ var (
Namespace: "two",
}

node1 = node{
node1 = classInDC{
dc: "dc1",
class: "alpha",
}

node2 = node{
node2 = classInDC{
dc: "dc1",
class: "beta",
}

node3 = classInDC{
dc: "dc1",
class: "", // not set
}
)

func TestBlockedResourceStats_Copy(t *testing.T) {
Expand All @@ -166,7 +171,7 @@ func TestBlockedResourceStats_Copy(t *testing.T) {
MemoryMB: 256,
},
}
a.ByNode = map[node]BlockedResourcesSummary{
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {
Timestamp: now1,
CPU: 300,
Expand All @@ -180,23 +185,23 @@ func TestBlockedResourceStats_Copy(t *testing.T) {
CPU: 888,
MemoryMB: 888,
}
c.ByNode[node1] = BlockedResourcesSummary{
c.ByClassInDC[node1] = BlockedResourcesSummary{
Timestamp: now2,
CPU: 999,
MemoryMB: 999,
}

// underlying data should have been deep copied
require.Equal(t, 100, a.ByJob[id1].CPU)
require.Equal(t, 300, a.ByNode[node1].CPU)
require.Equal(t, 300, a.ByClassInDC[node1].CPU)
}

func TestBlockedResourcesStats_Add(t *testing.T) {
a := NewBlockedResourcesStats()
a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{
id1: {Timestamp: now(1), CPU: 111, MemoryMB: 222},
}
a.ByNode = map[node]BlockedResourcesSummary{
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(2), CPU: 333, MemoryMB: 444},
}

Expand All @@ -205,7 +210,7 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
id1: {Timestamp: now(3), CPU: 200, MemoryMB: 300},
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
}
b.ByNode = map[node]BlockedResourcesSummary{
b.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(5), CPU: 600, MemoryMB: 700},
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
}
Expand All @@ -218,10 +223,10 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
}, result.ByJob)

require.Equal(t, map[node]BlockedResourcesSummary{
require.Equal(t, map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(5), CPU: 933, MemoryMB: 1144},
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
}, result.ByNode)
}, result.ByClassInDC)
})

// make sure we handle zeros in both directions
Expand All @@ -233,20 +238,31 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
}, result.ByJob)

require.Equal(t, map[node]BlockedResourcesSummary{
require.Equal(t, map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(2), CPU: 933, MemoryMB: 1144},
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
}, result.ByNode)
}, result.ByClassInDC)
})
}

func TestBlockedResourcesStats_Add_NoClass(t *testing.T) {
a := NewBlockedResourcesStats()
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node3: {Timestamp: now(1), CPU: 111, MemoryMB: 1111},
}
result := a.Add(a)
require.Equal(t, map[classInDC]BlockedResourcesSummary{
node3: {Timestamp: now(1), CPU: 222, MemoryMB: 2222},
}, result.ByClassInDC)
}

func TestBlockedResourcesStats_Subtract(t *testing.T) {
a := NewBlockedResourcesStats()
a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{
id1: {Timestamp: now(1), CPU: 100, MemoryMB: 100},
id2: {Timestamp: now(2), CPU: 200, MemoryMB: 200},
}
a.ByNode = map[node]BlockedResourcesSummary{
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(3), CPU: 300, MemoryMB: 300},
node2: {Timestamp: now(4), CPU: 400, MemoryMB: 400},
}
Expand All @@ -256,7 +272,7 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) {
id1: {Timestamp: now(5), CPU: 10, MemoryMB: 11},
id2: {Timestamp: now(6), CPU: 12, MemoryMB: 13},
}
b.ByNode = map[node]BlockedResourcesSummary{
b.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(7), CPU: 14, MemoryMB: 15},
node2: {Timestamp: now(8), CPU: 16, MemoryMB: 17},
}
Expand All @@ -274,14 +290,14 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) {
require.Equal(t, 187, result.ByJob[id2].MemoryMB)

// node1
require.Equal(t, now(7), result.ByNode[node1].Timestamp)
require.Equal(t, 286, result.ByNode[node1].CPU)
require.Equal(t, 285, result.ByNode[node1].MemoryMB)
require.Equal(t, now(7), result.ByClassInDC[node1].Timestamp)
require.Equal(t, 286, result.ByClassInDC[node1].CPU)
require.Equal(t, 285, result.ByClassInDC[node1].MemoryMB)

// node2
require.Equal(t, now(8), result.ByNode[node2].Timestamp)
require.Equal(t, 384, result.ByNode[node2].CPU)
require.Equal(t, 383, result.ByNode[node2].MemoryMB)
require.Equal(t, now(8), result.ByClassInDC[node2].Timestamp)
require.Equal(t, 384, result.ByClassInDC[node2].CPU)
require.Equal(t, 383, result.ByClassInDC[node2].MemoryMB)
}

// testBlockedEvalsRandomBlockedEval wraps an eval that is randomly generated.
Expand Down Expand Up @@ -363,9 +379,9 @@ func clearTimestampFromBlockedResourceStats(b *BlockedResourcesStats) {
v.Timestamp = time.Time{}
b.ByJob[k] = v
}
for k, v := range b.ByNode {
for k, v := range b.ByClassInDC {
v.Timestamp = time.Time{}
b.ByNode[k] = v
b.ByClassInDC[k] = v
}
}

Expand Down

0 comments on commit 8e25502

Please sign in to comment.