Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add per-node gpu metrics (allocated/total) #57

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions node.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ type NodeMetrics struct {
cpuIdle uint64
cpuOther uint64
cpuTotal uint64
gpuAlloc uint64
gpuTotal uint64
nodeStatus string
gpuType string
}

func NodeGetMetrics() map[string]*NodeMetrics {
Expand All @@ -55,7 +58,8 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
nodeName := node[0]
nodeStatus := node[4] // mixed, allocated, etc.

nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, ""}

nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, 0, 0, "", ""}

memAlloc, _ := strconv.ParseUint(node[1], 10, 64)
memTotal, _ := strconv.ParseUint(node[2], 10, 64)
Expand All @@ -67,6 +71,19 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
cpuOther, _ := strconv.ParseUint(cpuInfo[2], 10, 64)
cpuTotal, _ := strconv.ParseUint(cpuInfo[3], 10, 64)

if node[5] != "(null)" {
// Ignore everything after the opening parenthesis and split into resource name, type and count (e.g. "gpu:rtx5000:4")
availableTRES := strings.Split(strings.Split(node[5], "(")[0], ":")
usedTRES := strings.Split(strings.Split(node[6], "(")[0], ":")
gpuType := availableTRES[1]
gpuTotal, _ := strconv.ParseUint(availableTRES[2], 10, 64)
gpuAlloc, _ := strconv.ParseUint(usedTRES[2], 10, 64)

nodes[nodeName].gpuAlloc = gpuAlloc
nodes[nodeName].gpuTotal = gpuTotal
nodes[nodeName].gpuType = gpuType
}

nodes[nodeName].memAlloc = memAlloc
nodes[nodeName].memTotal = memTotal
nodes[nodeName].cpuAlloc = cpuAlloc
Expand All @@ -82,7 +99,7 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
// NodeData executes the sinfo command to get data for each node
// It returns the output of the sinfo command
func NodeData() []byte {
cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong")
cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong,Gres:50,Gresused:50")
out, err := cmd.Output()
if err != nil {
log.Fatal(err)
Expand All @@ -97,12 +114,15 @@ type NodeCollector struct {
cpuTotal *prometheus.Desc
memAlloc *prometheus.Desc
memTotal *prometheus.Desc
gpuAlloc *prometheus.Desc
gpuTotal *prometheus.Desc
}

// NewNodeCollector creates a Prometheus collector to keep all our stats in
// It returns a set of collections for consumption
func NewNodeCollector() *NodeCollector {
labels := []string{"node","status"}
labels_gpu := []string{"node","status","gputype"}

return &NodeCollector{
cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil),
Expand All @@ -111,6 +131,8 @@ func NewNodeCollector() *NodeCollector {
cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil),
memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil),
memTotal: prometheus.NewDesc("slurm_node_mem_total", "Total memory per node", labels, nil),
gpuAlloc: prometheus.NewDesc("slurm_node_gpu_alloc", "Allocated GPUs per node", labels_gpu, nil),
gpuTotal: prometheus.NewDesc("slurm_node_gpu_total", "Total GPUs per node", labels_gpu, nil),
}
}

Expand All @@ -122,6 +144,8 @@ func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- nc.cpuTotal
ch <- nc.memAlloc
ch <- nc.memTotal
ch <- nc.gpuAlloc
ch <- nc.gpuTotal
}

func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
Expand All @@ -133,5 +157,9 @@ func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(nodes[node].memTotal), node, nodes[node].nodeStatus)
if nodes[node].gpuType != "" {
ch <- prometheus.MustNewConstMetric(nc.gpuAlloc, prometheus.GaugeValue, float64(nodes[node].gpuAlloc), node, nodes[node].nodeStatus, nodes[node].gpuType)
ch <- prometheus.MustNewConstMetric(nc.gpuTotal, prometheus.GaugeValue, float64(nodes[node].gpuTotal), node, nodes[node].nodeStatus, nodes[node].gpuType)
}
}
}
5 changes: 4 additions & 1 deletion node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ slurm_node_cpus_other{name="a048",status="mix"} 0
slurm_node_cpus_total{name="a048",status="mix"} 16
slurm_node_mem_allocated{name="a048",status="mix"} 179384
slurm_node_mem_total{name="a048",status="mix"} 193000

slurm_node_gpu_allocated{gputype="rtx5000",name="a048",status="mix"} 2
slurm_node_gpu_total{gputype="rtx5000",name="a048",status="mix"} 4
*/

func TestNodeMetrics(t *testing.T) {
Expand All @@ -54,4 +55,6 @@ func TestNodeMetrics(t *testing.T) {
assert.Equal(t, uint64(0), metrics["b001"].cpuIdle)
assert.Equal(t, uint64(0), metrics["b001"].cpuOther)
assert.Equal(t, uint64(32), metrics["b001"].cpuTotal)
assert.Equal(t, uint64(4), metrics["b001"].gpuAlloc)
assert.Equal(t, uint64(4), metrics["b001"].gpuTotal)
}
8 changes: 8 additions & 0 deletions nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type NodesMetrics struct {
maint float64
mix float64
resv float64
plnd float64
}

func NodesGetMetrics() *NodesMetrics {
Expand Down Expand Up @@ -83,6 +84,7 @@ func ParseNodesMetrics(input []byte) *NodesMetrics {
maint := regexp.MustCompile(`^maint`)
mix := regexp.MustCompile(`^mix`)
resv := regexp.MustCompile(`^res`)
plnd := regexp.MustCompile(`^plan`)
switch {
case alloc.MatchString(state) == true:
nm.alloc += count
Expand All @@ -104,6 +106,8 @@ func ParseNodesMetrics(input []byte) *NodesMetrics {
nm.mix += count
case resv.MatchString(state) == true:
nm.resv += count
case plnd.MatchString(state) == true:
nm.plnd += count
}
}
}
Expand Down Expand Up @@ -145,6 +149,7 @@ func NewNodesCollector() *NodesCollector {
maint: prometheus.NewDesc("slurm_nodes_maint", "Maint nodes", nil, nil),
mix: prometheus.NewDesc("slurm_nodes_mix", "Mix nodes", nil, nil),
resv: prometheus.NewDesc("slurm_nodes_resv", "Reserved nodes", nil, nil),
plnd: prometheus.NewDesc("slurm_nodes_plnd", "Planned nodes", nil, nil),
}
}

Expand All @@ -159,6 +164,7 @@ type NodesCollector struct {
maint *prometheus.Desc
mix *prometheus.Desc
resv *prometheus.Desc
plnd *prometheus.Desc
}

// Send all metric descriptions
Expand All @@ -173,6 +179,7 @@ func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- nc.maint
ch <- nc.mix
ch <- nc.resv
ch <- nc.plnd
}
func (nc *NodesCollector) Collect(ch chan<- prometheus.Metric) {
nm := NodesGetMetrics()
Expand All @@ -186,4 +193,5 @@ func (nc *NodesCollector) Collect(ch chan<- prometheus.Metric) {
ch <- prometheus.MustNewConstMetric(nc.maint, prometheus.GaugeValue, nm.maint)
ch <- prometheus.MustNewConstMetric(nc.mix, prometheus.GaugeValue, nm.mix)
ch <- prometheus.MustNewConstMetric(nc.resv, prometheus.GaugeValue, nm.resv)
ch <- prometheus.MustNewConstMetric(nc.plnd, prometheus.GaugeValue, nm.plnd)
}
6 changes: 3 additions & 3 deletions test_data/sinfo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ lxfoo0373,idle
lxfoo0374,idle
lxfoo0375,idle
lxfoo0376,idle
lxfoo0377,idle
lxfoo0377,planned
lxfoo0378,idle
lxfoo0379,idle
lxfoo0380,idle
Expand All @@ -390,7 +390,7 @@ lxfoo0389,idle
lxfoo0390,idle
lxfoo0391,idle
lxfoo0392,idle
lxfoo0393,idle
lxfoo0393,planned
lxfoo0394,idle
lxfoo0395,idle
lxfoo0396,idle
Expand Down Expand Up @@ -457,7 +457,7 @@ lxfoo0456,idle
lxfoo0457,idle
lxfoo0458,idle
lxfoo0459,idle
lxfoo0460,idle
lxfoo0460,planned
lxfoo0461,idle
lxfoo0462,idle
lxfoo0463,idle
Expand Down
42 changes: 21 additions & 21 deletions test_data/sinfo_mem.txt
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
a048 163840 193000 16/0/0/16 mixed
a048 163840 193000 16/0/0/16 mixed
a048 163840 193000 16/0/0/16 idle
a048 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a049 163840 193000 16/0/0/16 idle
a050 163840 193000 16/0/0/16 idle
a050 163840 193000 16/0/0/16 idle
a050 163840 193000 16/0/0/16 idle
a051 163840 193000 16/0/0/16 idle
a051 163840 193000 16/0/0/16 idle
a051 163840 193000 16/0/0/16 idle
a052 0 193000 0/16/0/16 idle
b001 327680 386000 32/0/0/32 down
b001 327680 386000 32/0/0/32 down
b002 327680 386000 32/0/0/32 down
b002 327680 386000 32/0/0/32 idle
b003 296960 386000 29/3/0/32 down
b003 296960 386000 29/3/0/32 idle
a048 163840 193000 16/0/0/16 mixed gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0-1)
a048 163840 193000 16/0/0/16 mixed gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0-1)
a048 163840 193000 16/0/0/16 idle gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0-1)
a048 163840 193000 16/0/0/16 idle gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0-1)
a049 163840 193000 16/0/0/16 idle gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0,2)
a049 163840 193000 16/0/0/16 idle gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0,2)
a049 163840 193000 16/0/0/16 idle gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0,2)
a049 163840 193000 16/0/0/16 idle gpu:rtx5000:4(S:0-1) gpu:rtx5000:2(IDX:0,2)
a050 163840 193000 16/0/0/16 idle gpu:v100:8(S:0-1) gpu:v100:5(IDX:0,2-5)
a050 163840 193000 16/0/0/16 idle gpu:v100:8(S:0-1) gpu:v100:5(IDX:0,2-5)
a050 163840 193000 16/0/0/16 idle gpu:v100:8(S:0-1) gpu:v100:5(IDX:0,2-5)
a051 163840 193000 16/0/0/16 idle gpu:gtx1080:2 gpu:gtx1080:0(IDX:N/A)
a051 163840 193000 16/0/0/16 idle gpu:gtx1080:2 gpu:gtx1080:0(IDX:N/A)
a051 163840 193000 16/0/0/16 idle gpu:gtx1080:2 gpu:gtx1080:0(IDX:N/A)
a052 0 193000 0/16/0/16 idle gpu:gtx1080:2 gpu:gtx1080:2(IDX:0-1)
b001 327680 386000 32/0/0/32 down gpu:gtx980:4 gpu:gtx980:4(IDX:0-3)
b001 327680 386000 32/0/0/32 down gpu:gtx980:4 gpu:gtx980:4(IDX:0-3)
b002 327680 386000 32/0/0/32 down gpu:gtx980:4 gpu:gtx980:0(IDX:N/A)
b002 327680 386000 32/0/0/32 idle gpu:gtx980:4 gpu:gtx980:0(IDX:N/A)
b003 296960 386000 29/3/0/32 down gpu:k40:2 gpu:k40:1(IDX:0)
b003 296960 386000 29/3/0/32 idle gpu:k40:2 gpu:k40:1(IDX:0)