Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: classless blocked evals get metrics #13786

Merged
merged 1 commit into from
Jul 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/13786.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
metrics: Fixed a bug where blocked evals with no class produced no dc:class scope metrics
```
10 changes: 5 additions & 5 deletions nomad/blocked_evals.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import (
"sync"
"time"

metrics "github.com/armon/go-metrics"
log "github.com/hashicorp/go-hclog"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
Expand All @@ -31,7 +31,7 @@ const (
// failed allocation becomes available.
type BlockedEvals struct {
// logger is the logger to use by the blocked eval tracker.
logger log.Logger
logger hclog.Logger

evalBroker *EvalBroker
enabled bool
Expand Down Expand Up @@ -96,7 +96,7 @@ type wrappedEval struct {

// NewBlockedEvals creates a new blocked eval tracker that will enqueue
// unblocked evals into the passed broker.
func NewBlockedEvals(evalBroker *EvalBroker, logger log.Logger) *BlockedEvals {
func NewBlockedEvals(evalBroker *EvalBroker, logger hclog.Logger) *BlockedEvals {
return &BlockedEvals{
logger: logger.Named("blocked_evals"),
evalBroker: evalBroker,
Expand Down Expand Up @@ -727,7 +727,7 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels)
}

for k, v := range stats.BlockedResources.ByNode {
for k, v := range stats.BlockedResources.ByClassInDC {
labels := []metrics.Label{
{Name: "datacenter", Value: k.dc},
{Name: "node_class", Value: k.class},
Expand Down
42 changes: 23 additions & 19 deletions nomad/blocked_evals_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ type BlockedStats struct {
BlockedResources *BlockedResourcesStats
}

// node stores information related to nodes.
type node struct {
// classInDC is a coordinate of a specific class in a specific datacenter
type classInDC struct {
dc string
class string
}
Expand Down Expand Up @@ -65,9 +65,9 @@ func (b *BlockedStats) prune(cutoff time.Time) {
}
}

for k, v := range b.BlockedResources.ByNode {
for k, v := range b.BlockedResources.ByClassInDC {
if shouldPrune(v) {
delete(b.BlockedResources.ByNode, k)
delete(b.BlockedResources.ByClassInDC, k)
}
}
}
Expand All @@ -89,6 +89,10 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
for class := range allocMetrics.ClassExhausted {
classes[class] = struct{}{}
}
if len(allocMetrics.ClassExhausted) == 0 {
// some evaluations have no class
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rude 😂

classes[""] = struct{}{}
}
for _, r := range allocMetrics.ResourcesExhausted {
resources.CPU += r.CPU
resources.MemoryMB += r.MemoryMB
Expand All @@ -99,32 +103,32 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
nsID := structs.NewNamespacedID(eval.JobID, eval.Namespace)
byJob[nsID] = resources

byNodeInfo := make(map[node]BlockedResourcesSummary)
byClassInDC := make(map[classInDC]BlockedResourcesSummary)
for dc := range dcs {
for class := range classes {
k := node{dc: dc, class: class}
byNodeInfo[k] = resources
k := classInDC{dc: dc, class: class}
byClassInDC[k] = resources
}
}

return &BlockedResourcesStats{
ByJob: byJob,
ByNode: byNodeInfo,
ByJob: byJob,
ByClassInDC: byClassInDC,
}
}

// BlockedResourcesStats stores resources requested by blocked evaluations,
// tracked both by job and by node.
type BlockedResourcesStats struct {
ByJob map[structs.NamespacedID]BlockedResourcesSummary
ByNode map[node]BlockedResourcesSummary
ByJob map[structs.NamespacedID]BlockedResourcesSummary
ByClassInDC map[classInDC]BlockedResourcesSummary
}

// NewBlockedResourcesStats returns a new BlockedResourcesStats.
func NewBlockedResourcesStats() *BlockedResourcesStats {
return &BlockedResourcesStats{
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
ByNode: make(map[node]BlockedResourcesSummary),
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
ByClassInDC: make(map[classInDC]BlockedResourcesSummary),
}
}

Expand All @@ -136,8 +140,8 @@ func (b *BlockedResourcesStats) Copy() *BlockedResourcesStats {
result.ByJob[k] = v // value copy
}

for k, v := range b.ByNode {
result.ByNode[k] = v // value copy
for k, v := range b.ByClassInDC {
result.ByClassInDC[k] = v // value copy
}

return result
Expand All @@ -152,8 +156,8 @@ func (b *BlockedResourcesStats) Add(a *BlockedResourcesStats) *BlockedResourcesS
result.ByJob[k] = b.ByJob[k].Add(v)
}

for k, v := range a.ByNode {
result.ByNode[k] = b.ByNode[k].Add(v)
for k, v := range a.ByClassInDC {
result.ByClassInDC[k] = b.ByClassInDC[k].Add(v)
}

return result
Expand All @@ -168,8 +172,8 @@ func (b *BlockedResourcesStats) Subtract(a *BlockedResourcesStats) *BlockedResou
result.ByJob[k] = b.ByJob[k].Subtract(v)
}

for k, v := range a.ByNode {
result.ByNode[k] = b.ByNode[k].Subtract(v)
for k, v := range a.ByClassInDC {
result.ByClassInDC[k] = b.ByClassInDC[k].Subtract(v)
}

return result
Expand Down
62 changes: 39 additions & 23 deletions nomad/blocked_evals_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ func TestBlockedResourceStats_New(t *testing.T) {
a := NewBlockedResourcesStats()
require.NotNil(t, a.ByJob)
require.Empty(t, a.ByJob)
require.NotNil(t, a.ByNode)
require.Empty(t, a.ByNode)
require.NotNil(t, a.ByClassInDC)
require.Empty(t, a.ByClassInDC)
}

var (
Expand All @@ -143,15 +143,20 @@ var (
Namespace: "two",
}

node1 = node{
node1 = classInDC{
dc: "dc1",
class: "alpha",
}

node2 = node{
node2 = classInDC{
dc: "dc1",
class: "beta",
}

node3 = classInDC{
dc: "dc1",
class: "", // not set
}
)

func TestBlockedResourceStats_Copy(t *testing.T) {
Expand All @@ -166,7 +171,7 @@ func TestBlockedResourceStats_Copy(t *testing.T) {
MemoryMB: 256,
},
}
a.ByNode = map[node]BlockedResourcesSummary{
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {
Timestamp: now1,
CPU: 300,
Expand All @@ -180,23 +185,23 @@ func TestBlockedResourceStats_Copy(t *testing.T) {
CPU: 888,
MemoryMB: 888,
}
c.ByNode[node1] = BlockedResourcesSummary{
c.ByClassInDC[node1] = BlockedResourcesSummary{
Timestamp: now2,
CPU: 999,
MemoryMB: 999,
}

// underlying data should have been deep copied
require.Equal(t, 100, a.ByJob[id1].CPU)
require.Equal(t, 300, a.ByNode[node1].CPU)
require.Equal(t, 300, a.ByClassInDC[node1].CPU)
}

func TestBlockedResourcesStats_Add(t *testing.T) {
a := NewBlockedResourcesStats()
a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{
id1: {Timestamp: now(1), CPU: 111, MemoryMB: 222},
}
a.ByNode = map[node]BlockedResourcesSummary{
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(2), CPU: 333, MemoryMB: 444},
}

Expand All @@ -205,7 +210,7 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
id1: {Timestamp: now(3), CPU: 200, MemoryMB: 300},
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
}
b.ByNode = map[node]BlockedResourcesSummary{
b.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(5), CPU: 600, MemoryMB: 700},
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
}
Expand All @@ -218,10 +223,10 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
}, result.ByJob)

require.Equal(t, map[node]BlockedResourcesSummary{
require.Equal(t, map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(5), CPU: 933, MemoryMB: 1144},
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
}, result.ByNode)
}, result.ByClassInDC)
})

// make sure we handle zeros in both directions
Expand All @@ -233,20 +238,31 @@ func TestBlockedResourcesStats_Add(t *testing.T) {
id2: {Timestamp: now(4), CPU: 400, MemoryMB: 500},
}, result.ByJob)

require.Equal(t, map[node]BlockedResourcesSummary{
require.Equal(t, map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(2), CPU: 933, MemoryMB: 1144},
node2: {Timestamp: now(6), CPU: 800, MemoryMB: 900},
}, result.ByNode)
}, result.ByClassInDC)
})
}

func TestBlockedResourcesStats_Add_NoClass(t *testing.T) {
a := NewBlockedResourcesStats()
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node3: {Timestamp: now(1), CPU: 111, MemoryMB: 1111},
}
result := a.Add(a)
require.Equal(t, map[classInDC]BlockedResourcesSummary{
node3: {Timestamp: now(1), CPU: 222, MemoryMB: 2222},
}, result.ByClassInDC)
}

func TestBlockedResourcesStats_Subtract(t *testing.T) {
a := NewBlockedResourcesStats()
a.ByJob = map[structs.NamespacedID]BlockedResourcesSummary{
id1: {Timestamp: now(1), CPU: 100, MemoryMB: 100},
id2: {Timestamp: now(2), CPU: 200, MemoryMB: 200},
}
a.ByNode = map[node]BlockedResourcesSummary{
a.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(3), CPU: 300, MemoryMB: 300},
node2: {Timestamp: now(4), CPU: 400, MemoryMB: 400},
}
Expand All @@ -256,7 +272,7 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) {
id1: {Timestamp: now(5), CPU: 10, MemoryMB: 11},
id2: {Timestamp: now(6), CPU: 12, MemoryMB: 13},
}
b.ByNode = map[node]BlockedResourcesSummary{
b.ByClassInDC = map[classInDC]BlockedResourcesSummary{
node1: {Timestamp: now(7), CPU: 14, MemoryMB: 15},
node2: {Timestamp: now(8), CPU: 16, MemoryMB: 17},
}
Expand All @@ -274,14 +290,14 @@ func TestBlockedResourcesStats_Subtract(t *testing.T) {
require.Equal(t, 187, result.ByJob[id2].MemoryMB)

// node1
require.Equal(t, now(7), result.ByNode[node1].Timestamp)
require.Equal(t, 286, result.ByNode[node1].CPU)
require.Equal(t, 285, result.ByNode[node1].MemoryMB)
require.Equal(t, now(7), result.ByClassInDC[node1].Timestamp)
require.Equal(t, 286, result.ByClassInDC[node1].CPU)
require.Equal(t, 285, result.ByClassInDC[node1].MemoryMB)

// node2
require.Equal(t, now(8), result.ByNode[node2].Timestamp)
require.Equal(t, 384, result.ByNode[node2].CPU)
require.Equal(t, 383, result.ByNode[node2].MemoryMB)
require.Equal(t, now(8), result.ByClassInDC[node2].Timestamp)
require.Equal(t, 384, result.ByClassInDC[node2].CPU)
require.Equal(t, 383, result.ByClassInDC[node2].MemoryMB)
}

// testBlockedEvalsRandomBlockedEval wraps an eval that is randomly generated.
Expand Down Expand Up @@ -363,9 +379,9 @@ func clearTimestampFromBlockedResourceStats(b *BlockedResourcesStats) {
v.Timestamp = time.Time{}
b.ByJob[k] = v
}
for k, v := range b.ByNode {
for k, v := range b.ByClassInDC {
v.Timestamp = time.Time{}
b.ByNode[k] = v
b.ByClassInDC[k] = v
}
}

Expand Down