Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update gpus.go for gpu_type and Slurm>=19.05.0rc1 #73

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module github.com/vpenso/prometheus-slurm-exporter
module github.com/itzsimpl/prometheus-slurm-exporter

go 1.12

Expand Down
169 changes: 138 additions & 31 deletions gpus.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2020 Joeri Hermans, Victor Penso, Matteo Dessalvi
/* Copyright 2022 Joeri Hermans, Victor Penso, Matteo Dessalvi, Iztok Lebar Bajec

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -16,17 +16,19 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
"io/ioutil"
"os/exec"
"strings"
"regexp"
"strconv"
"strings"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
)

// GPUsMetrics is the cluster-wide GPU snapshot produced by ParseGPUsMetrics.
// The fields are, in order: allocated GPUs, idle GPUs, GPUs that are neither
// allocated nor idle, the total GPU count, and the allocated/total ratio.
type GPUsMetrics struct {
	alloc, idle, other, total, utilization float64
}
Expand All @@ -35,6 +37,11 @@ func GPUsGetMetrics() *GPUsMetrics {
return ParseGPUsMetrics()
}

/* TODO:
sinfo has gresUSED since slurm>=19.05.0rc01 https://github.com/SchedMD/slurm/blob/master/NEWS
revert to old process on slurm<19.05.0rc01
--format=AllocGRES will return gres/gpu=8
--format=AllocTRES will return billing=16,cpu=16,gres/gpu=8,mem=256G,node=1
func ParseAllocatedGPUs() float64 {
var num_gpus = 0.0

Expand All @@ -53,21 +60,106 @@ func ParseAllocatedGPUs() float64 {

return num_gpus
}
*/

func ParseTotalGPUs() float64 {
func ParseAllocatedGPUs(data []byte) float64 {
var num_gpus = 0.0
// sinfo -a -h --Format="Nodes: ,GresUsed:" --state=allocated
// 3 gpu:2 # slurm>=20.11.8
// 1 gpu:(null):3(IDX:0-7) # slurm 21.08.5
// 13 gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3) # slurm 21.08.5

sinfo_lines := string(data)
re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
if len(sinfo_lines) > 0 {
for _, line := range strings.Split(sinfo_lines, "\n") {
// log.info(line)
if len(line) > 0 && strings.Contains(line, "gpu:") {
nodes := strings.Fields(line)[0]
num_nodes, _ := strconv.ParseFloat(nodes, 64)
node_active_gpus := strings.Fields(line)[1]
num_node_active_gpus := 0.0
for _, node_active_gpus_type := range strings.Split(node_active_gpus, ",") {
if strings.Contains(node_active_gpus_type, "gpu:") {
node_active_gpus_type = re.FindStringSubmatch(node_active_gpus_type)[2]
num_node_active_gpus_type, _ := strconv.ParseFloat(node_active_gpus_type, 64)
num_node_active_gpus += num_node_active_gpus_type
}
}
num_gpus += num_nodes * num_node_active_gpus
}
}
}

args := []string{"-h", "-o \"%n %G\""}
output := string(Execute("sinfo", args))
if len(output) > 0 {
for _, line := range strings.Split(output, "\n") {
if len(line) > 0 {
line = strings.Trim(line, "\"")
descriptor := strings.Fields(line)[1]
descriptor = strings.TrimPrefix(descriptor, "gpu:")
descriptor = strings.Split(descriptor, "(")[0]
node_gpus, _ := strconv.ParseFloat(descriptor, 64)
num_gpus += node_gpus
return num_gpus
}

// idleGresGPUCountRE captures the GPU count (submatch 2) of a single GRES
// entry such as "gpu:2", "gpu:8(S:0-1)", "gpu:A30:4(IDX:0-3)" or
// "gpu:(null):3(IDX:0-7)". It is compiled once instead of on every scrape.
var idleGresGPUCountRE = regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)

// sumIdleGresGPUs adds up the GPU counts of every "gpu:" entry in one
// comma-separated GRES column. Entries that are not GPUs, or that carry no
// parseable count, contribute nothing instead of causing a panic.
func sumIdleGresGPUs(gres string) float64 {
	total := 0.0
	for _, entry := range strings.Split(gres, ",") {
		if !strings.Contains(entry, "gpu:") {
			continue
		}
		match := idleGresGPUCountRE.FindStringSubmatch(entry)
		if match == nil {
			// e.g. "gpu:tesla" with no count; indexing a nil match would panic.
			continue
		}
		count, err := strconv.ParseFloat(match[2], 64)
		if err != nil {
			continue
		}
		total += count
	}
	return total
}

// ParseIdleGPUs returns the number of GPUs that are configured but not in use,
// computed from the output of
//
//	sinfo -a -h --Format="Nodes: ,Gres: ,GresUsed:" --state=idle,allocated
//
// Each line describes a group of identical nodes:
//
//	3 gpu:4 gpu:2                                   # slurm 20.11.8
//	1 gpu:8(S:0-1) gpu:(null):3(IDX:0-7)            # slurm 21.08.5
//	13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) gpu:A30:4(IDX:0-3),gpu:Q6K:4(IDX:0-3)
//
// For every line the result grows by nodes * (configured GPUs - used GPUs).
// Lines without a "gpu:" entry, with fewer than three columns, or with a
// non-numeric node count are skipped.
func ParseIdleGPUs(data []byte) float64 {
	var num_gpus = 0.0

	for _, line := range strings.Split(string(data), "\n") {
		if !strings.Contains(line, "gpu:") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 3 {
			// Without the GresUsed column the idle count cannot be derived.
			continue
		}
		num_nodes, err := strconv.ParseFloat(fields[0], 64)
		if err != nil {
			continue
		}
		num_gpus += num_nodes * (sumIdleGresGPUs(fields[1]) - sumIdleGresGPUs(fields[2]))
	}

	return num_gpus
}

func ParseTotalGPUs(data []byte) float64 {
var num_gpus = 0.0
// sinfo -a -h --Format="Nodes: ,Gres:"
// 3 gpu:4 # slurm 20.11.8
// 1 gpu:8(S:0-1) # slurm 21.08.5
// 13 gpu:A30:4(S:0-1),gpu:Q6K:40(S:0-1) # slurm 21.08.5

sinfo_lines := string(data)
re := regexp.MustCompile(`gpu:(\(null\)|[^:(]*):?([0-9]+)(\([^)]*\))?`)
if len(sinfo_lines) > 0 {
for _, line := range strings.Split(sinfo_lines, "\n") {
// log.Info(line)
if len(line) > 0 && strings.Contains(line, "gpu:") {
nodes := strings.Fields(line)[0]
num_nodes, _ := strconv.ParseFloat(nodes, 64)
node_gpus := strings.Fields(line)[1]
num_node_gpus := 0.0
for _, node_gpus_type := range strings.Split(node_gpus, ",") {
if strings.Contains(node_gpus_type, "gpu:") {
node_gpus_type = re.FindStringSubmatch(node_gpus_type)[2]
num_node_gpus_type, _ := strconv.ParseFloat(node_gpus_type, 64)
num_node_gpus += num_node_gpus_type
}
}
num_gpus += num_nodes * num_node_gpus
}
}
}
Expand All @@ -77,29 +169,40 @@ func ParseTotalGPUs() float64 {

func ParseGPUsMetrics() *GPUsMetrics {
var gm GPUsMetrics
total_gpus := ParseTotalGPUs()
allocated_gpus := ParseAllocatedGPUs()
total_gpus := ParseTotalGPUs(TotalGPUsData())
allocated_gpus := ParseAllocatedGPUs(AllocatedGPUsData())
idle_gpus := ParseIdleGPUs(IdleGPUsData())
other_gpus := total_gpus - allocated_gpus - idle_gpus
gm.alloc = allocated_gpus
gm.idle = total_gpus - allocated_gpus
gm.idle = idle_gpus
gm.other = other_gpus
gm.total = total_gpus
gm.utilization = allocated_gpus / total_gpus
return &gm
}

// AllocatedGPUsData runs sinfo restricted to nodes in the allocated state and
// returns its raw output: one "<nodes> <GresUsed>" line per node group.
func AllocatedGPUsData() []byte {
	return Execute("sinfo", []string{
		"-a",
		"-h",
		"--Format=Nodes: ,GresUsed:",
		"--state=allocated",
	})
}

// IdleGPUsData runs sinfo restricted to nodes in the idle or allocated state
// and returns its raw output: one "<nodes> <Gres> <GresUsed>" line per node
// group.
func IdleGPUsData() []byte {
	return Execute("sinfo", []string{
		"-a",
		"-h",
		"--Format=Nodes: ,Gres: ,GresUsed:",
		"--state=idle,allocated",
	})
}

// TotalGPUsData runs sinfo without a state filter and returns its raw output:
// one "<nodes> <Gres>" line per node group.
func TotalGPUsData() []byte {
	return Execute("sinfo", []string{
		"-a",
		"-h",
		"--Format=Nodes: ,Gres:",
	})
}

// Execute the sinfo command and return its output
func Execute(command string, arguments []string) []byte {
cmd := exec.Command(command, arguments...)
stdout, err := cmd.StdoutPipe()
out, err := cmd.CombinedOutput()
if err != nil {
log.Fatal(err)
}
if err := cmd.Start(); err != nil {
log.Fatal(err)
}
out, _ := ioutil.ReadAll(stdout)
if err := cmd.Wait(); err != nil {
log.Fatal(err)
}
return out
}

Expand All @@ -111,16 +214,18 @@ func Execute(command string, arguments []string) []byte {

func NewGPUsCollector() *GPUsCollector {
return &GPUsCollector{
alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
other: prometheus.NewDesc("slurm_gpus_other", "Other GPUs", nil, nil),
total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
}
}

// GPUsCollector holds one Prometheus descriptor per exported GPU gauge:
// allocated, idle, other, total and utilization.
type GPUsCollector struct {
	alloc, idle, other, total, utilization *prometheus.Desc
}
Expand All @@ -129,13 +234,15 @@ type GPUsCollector struct {
// Describe sends the descriptor of every GPU metric to ch, in the order
// alloc, idle, other, total, utilization.
func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
	descs := []*prometheus.Desc{
		cc.alloc,
		cc.idle,
		cc.other,
		cc.total,
		cc.utilization,
	}
	for _, desc := range descs {
		ch <- desc
	}
}
// Collect takes a fresh GPU snapshot via GPUsGetMetrics and sends each value
// to ch as a gauge, in the order alloc, idle, other, total, utilization.
func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
	cm := GPUsGetMetrics()
	gauges := []struct {
		desc  *prometheus.Desc
		value float64
	}{
		{cc.alloc, cm.alloc},
		{cc.idle, cm.idle},
		{cc.other, cm.other},
		{cc.total, cm.total},
		{cc.utilization, cm.utilization},
	}
	for _, g := range gauges {
		ch <- prometheus.MustNewConstMetric(g.desc, prometheus.GaugeValue, g.value)
	}
}
63 changes: 63 additions & 0 deletions gpus_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/* Copyright 2022 Iztok Lebar Bajec

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */

package main

import (
"io/ioutil"
"os"
"path/filepath"
"strings"
"testing"
)

// TestGPUsMetrics feeds the recorded sinfo output of every
// test_data/slurm-<version> directory through ParseAllocatedGPUs,
// ParseIdleGPUs and ParseTotalGPUs and logs the parsed values.
// A missing or unreadable fixture file fails the test.
func TestGPUsMetrics(t *testing.T) {
	fixtureDirs, err := filepath.Glob("test_data/slurm-*")
	if err != nil {
		t.Fatalf("Can not list test data: %v", err)
	}

	// readFixture returns the content of one fixture file. The handle is
	// closed before returning so the loop below does not accumulate open
	// files, and read errors are reported instead of being dropped.
	readFixture := func(path string) []byte {
		file, err := os.Open(path)
		if err != nil {
			t.Fatalf("Can not open test data: %v", err)
		}
		defer file.Close()

		data, err := ioutil.ReadAll(file)
		if err != nil {
			t.Fatalf("Can not read test data: %v", err)
		}
		return data
	}

	for _, dir := range fixtureDirs {
		t.Logf("slurm-%s", strings.TrimPrefix(dir, "test_data/slurm-"))

		allocated := ParseAllocatedGPUs(readFixture(dir + "/sinfo_gpus_allocated.txt"))
		t.Logf("Allocated: %+v", allocated)

		idle := ParseIdleGPUs(readFixture(dir + "/sinfo_gpus_idle.txt"))
		t.Logf("Idle: %+v", idle)

		total := ParseTotalGPUs(readFixture(dir + "/sinfo_gpus_total.txt"))
		t.Logf("Total: %+v", total)
	}
}

// TestGPUsGetMetrics logs the metrics returned by GPUsGetMetrics; it makes no
// assertions about the values.
func TestGPUsGetMetrics(t *testing.T) {
	metrics := GPUsGetMetrics()
	t.Logf("%+v", metrics)
}
19 changes: 9 additions & 10 deletions node.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ import (

// NodeMetrics stores metrics for each node
type NodeMetrics struct {
memAlloc uint64
memTotal uint64
cpuAlloc uint64
cpuIdle uint64
cpuOther uint64
cpuTotal uint64
memAlloc uint64
memTotal uint64
cpuAlloc uint64
cpuIdle uint64
cpuOther uint64
cpuTotal uint64
nodeStatus string
}

Expand Down Expand Up @@ -60,7 +60,6 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
memAlloc, _ := strconv.ParseUint(node[1], 10, 64)
memTotal, _ := strconv.ParseUint(node[2], 10, 64)


cpuInfo := strings.Split(node[3], "/")
cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64)
cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64)
Expand All @@ -82,7 +81,7 @@ func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
// NodeData executes the sinfo command to get data for each node
// It returns the output of the sinfo command
func NodeData() []byte {
cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong")
cmd := exec.Command("sinfo", "-h", "-N", "-O", "NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:")
out, err := cmd.Output()
if err != nil {
log.Fatal(err)
Expand All @@ -102,7 +101,7 @@ type NodeCollector struct {
// NewNodeCollector creates a Prometheus collector to keep all our stats in
// It returns a set of collections for consumption
func NewNodeCollector() *NodeCollector {
labels := []string{"node","status"}
labels := []string{"node", "status"}

return &NodeCollector{
cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil),
Expand All @@ -128,7 +127,7 @@ func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
nodes := NodeGetMetrics()
for node := range nodes {
ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus)
ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus)
Expand Down
5 changes: 5 additions & 0 deletions test_data/_slurm-17.11.2/sacct_gpus_allocated.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
gpu:4
gpu:7
gpu:1
gpu:1
gpu:1
2 changes: 2 additions & 0 deletions test_data/_slurm-17.11.2/sinfo_gpus_total.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
xantipa gpu:8
xantipa2 gpu:8
3 changes: 3 additions & 0 deletions test_data/slurm-20.11.8/sinfo_gpus_allocated.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
98 gpu:0
3 gpu:2
1 gpu:1
4 changes: 4 additions & 0 deletions test_data/slurm-20.11.8/sinfo_gpus_idle.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
99 (null) gpu:0
3 gpu:2 gpu:2
1 gpu:2 gpu:1
20 gpu:2 gpu:0
2 changes: 2 additions & 0 deletions test_data/slurm-20.11.8/sinfo_gpus_total.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
101 (null)
24 gpu:2
Empty file.
1 change: 1 addition & 0 deletions test_data/slurm-21.08.5/sinfo_gpus_idle.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2 gpu:8 gpu:(null):0(IDX:N/A)
1 change: 1 addition & 0 deletions test_data/slurm-21.08.5/sinfo_gpus_total.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2 gpu:8