Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added logging of Slurm subprocess failures and testing for all parsing functions #43

Open
wants to merge 31 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8420a24
Corrected function name in cpus_test.go
Rovanion Feb 12, 2021
a04f2a2
Split tests into unit and system tests
Rovanion Feb 12, 2021
e7064fd
Corrected formatting of the project
Rovanion Feb 18, 2021
e5d513b
Add prometheus-slurm-exporter to .gitignore
Rovanion Feb 22, 2021
66472ef
Added Logging library
Rovanion Feb 22, 2021
38aae50
Made accounts.go print error messages from subprocesses
Rovanion Feb 22, 2021
af6975f
Added two logging functions for use in utility functions
Rovanion Feb 24, 2021
403a39f
Extracted the unix command execution from accounts.go to a generic su…
Rovanion Feb 24, 2021
706748f
Refactored cpus to use subprocess.go
Rovanion Feb 25, 2021
8644bba
Refactored gpus.go to use subprocess.go and also added unit and syste…
Rovanion Feb 25, 2021
2b46da1
Refactored nodes.go to use subprocess.go
Rovanion Feb 25, 2021
95c6ffb
Refactored partitions.go to use subprocess.go
Rovanion Feb 26, 2021
fe6be80
Added system and unit tests for partition data gathering
Rovanion Feb 26, 2021
2fd60b2
Refactored queue.go to use subprocess.go
Rovanion Feb 26, 2021
f1b1d22
Added real unit tests for queue.go
Rovanion Feb 26, 2021
260c772
Refactored scheduler.go to use subprocess.go
Rovanion Feb 26, 2021
e851819
Added actual unit tests for scheduler.go
Rovanion Feb 26, 2021
00a4cbc
Refactored sshare.go to use subprocess.go
Rovanion Mar 1, 2021
13c7526
Added tests for sshare.go
Rovanion Mar 1, 2021
bb630bd
Refactored users.go to use subprocess.go
Rovanion Mar 1, 2021
d1b68e6
Added tests for users.go
Rovanion Mar 1, 2021
fbda610
Clearify an argument name in accounts.go
Rovanion Mar 1, 2021
3574e0e
Added tests for accounts.go
Rovanion Mar 1, 2021
bcc30db
Corrected function name in nodes_system_test.go
Rovanion Mar 1, 2021
ca0012f
Corrected imports in system tests for partitions, users and sshare
Rovanion Mar 1, 2021
ec689e7
Corrected function name in users_system_test.go
Rovanion Mar 1, 2021
a755555
Implemented an actual unit tests for cpus.go's ParseCPUsMetrics
Rovanion Mar 1, 2021
dca9b63
Wrote actual tests for nodes.go
Rovanion Mar 2, 2021
9a3570b
Merge remote-tracking branch 'origin/master' into log-slurm-failure-o…
Rovanion Aug 4, 2021
9afe3af
Refactored node.go to use subprocess.go
Rovanion Aug 4, 2021
611df5a
Revert e7064fd, undoing style fixes
Rovanion Aug 4, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
bin/
*.snap
prometheus-slurm-exporter
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ PROJECT_NAME = prometheus-slurm-exporter
ifndef GOPATH
GOPATH=$(shell pwd):/usr/share/gocode
endif
GOFILES=accounts.go cpus.go gpus.go main.go node.go nodes.go partitions.go queue.go scheduler.go sshare.go users.go
GOFILES=accounts.go cpus.go gpus.go main.go node.go nodes.go partitions.go queue.go scheduler.go sshare.go users.go log.go subprocess.go
GOBIN=bin/$(PROJECT_NAME)

build:
Expand All @@ -13,6 +13,12 @@ build:
test:
@GOPATH=$(GOPATH) go test -v *.go

unittest:
@GOPATH=$(GOPATH) go test -v --tags unit

systemtest:
@GOPATH=$(GOPATH) go test -v --tags system

run:
@GOPATH=$(GOPATH) go run $(GOFILES)

Expand Down
26 changes: 6 additions & 20 deletions accounts.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright 2020 Victor Penso
Copyright 2021 Rovanion Luckey

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -16,31 +17,12 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
"io/ioutil"
"os/exec"
"log"
"strings"
"strconv"
"regexp"
"github.com/prometheus/client_golang/prometheus"
)

// AccountsData runs squeue with the arguments "-a", "-r", "-h" and
// "-o %A|%a|%T|%C" and returns everything the command wrote to standard
// output.
//
// Every failure (creating the pipe, starting the command, reading its
// output, or the command not completing successfully) is handed to
// log.Fatal.
func AccountsData() []byte {
	cmd := exec.Command("squeue", "-a", "-r", "-h", "-o %A|%a|%T|%C")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// Drain the pipe before calling Wait; a read error must not be dropped,
	// otherwise truncated output would be parsed as if it were complete.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}

type JobMetrics struct {
pending float64
running float64
Expand Down Expand Up @@ -78,6 +60,10 @@ func ParseAccountsMetrics(input []byte) map[string]*JobMetrics {
return accounts
}

// GetAccountsMetrics passes the result of Subprocess("squeue", ...) to
// ParseAccountsMetrics and returns the resulting per-account job metrics.
// NOTE(review): Subprocess is defined outside this view; presumably it
// returns the command's standard output. Confirm against subprocess.go.
func GetAccountsMetrics() map[string]*JobMetrics {
	squeueOutput := Subprocess("squeue", "-a", "-r", "-h", "-o %A|%a|%T|%C")
	return ParseAccountsMetrics(squeueOutput)
}

type AccountsCollector struct {
pending *prometheus.Desc
running *prometheus.Desc
Expand All @@ -103,7 +89,7 @@ func (ac *AccountsCollector) Describe(ch chan<- *prometheus.Desc) {
}

func (ac *AccountsCollector) Collect(ch chan<- prometheus.Metric) {
am := ParseAccountsMetrics(AccountsData())
am := GetAccountsMetrics()
for a := range am {
if am[a].pending > 0 {
ch <- prometheus.MustNewConstMetric(ac.pending, prometheus.GaugeValue, am[a].pending, a)
Expand Down
25 changes: 25 additions & 0 deletions accounts_system_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// +build system

/* Copyright 2021 Rovanion Luckey

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
"testing"
)

// TestGetAccountsMetrics logs the value returned by GetAccountsMetrics.
// It makes no assertions about the content.
func TestGetAccountsMetrics(t *testing.T) {
	metrics := GetAccountsMetrics()
	t.Logf("%+v", metrics)
}
44 changes: 44 additions & 0 deletions accounts_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// +build unit

/* Copyright 2021 Rovanion Luckey

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
package main

import (
"io/ioutil"
"os"
"testing"
)

// TestAccountsMetrics feeds the recorded squeue output stored in
// test_data/squeue_no_accounts.txt to ParseAccountsMetrics and checks the
// job counters reported for the "(null)" account.
func TestAccountsMetrics(t *testing.T) {
	// Read the input data from a file
	file, err := os.Open("test_data/squeue_no_accounts.txt")
	if err != nil {
		t.Fatalf("Can not open test data: %v", err)
	}
	defer file.Close()
	data, err := ioutil.ReadAll(file)
	if err != nil {
		t.Fatalf("Can not read test data: %v", err)
	}
	accounts := ParseAccountsMetrics(data)

	// A missing key yields a nil *JobMetrics; fail with a clear message
	// instead of panicking on the first field access below.
	null := accounts["(null)"]
	if null == nil {
		t.Fatalf("No metrics parsed for account %q", "(null)")
	}

	if null.pending != 449.0 {
		t.Errorf("Miscount of pending account jobs, got: %v, wanted: %f", null.pending, 449.0)
	}
	if null.running != 79.0 {
		t.Errorf("Miscount of running account jobs, got: %v, wanted: %f", null.running, 79.0)
	}
	if null.running_cpus != 798.0 {
		t.Errorf("Miscount of running_cpus account jobs, got: %v, wanted: %f", null.running_cpus, 798.0)
	}
	if null.suspended != 0.0 {
		t.Errorf("Miscount of suspended account jobs, got: %v, wanted: %f", null.suspended, 0.0)
	}
}
25 changes: 3 additions & 22 deletions cpus.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright 2017 Victor Penso, Matteo Dessalvi
Copyright 2021 Rovanion Luckey

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -17,9 +18,6 @@ package main

import (
"github.com/prometheus/client_golang/prometheus"
"io/ioutil"
"log"
"os/exec"
"strconv"
"strings"
)
Expand All @@ -31,10 +29,6 @@ type CPUsMetrics struct {
total float64
}

// CPUsGetMetrics hands the bytes returned by CPUsData to ParseCPUsMetrics
// and returns the parsed CPU metrics.
func CPUsGetMetrics() *CPUsMetrics {
	sinfoOutput := CPUsData()
	return ParseCPUsMetrics(sinfoOutput)
}

func ParseCPUsMetrics(input []byte) *CPUsMetrics {
var cm CPUsMetrics
if strings.Contains(string(input), "/") {
Expand All @@ -47,21 +41,8 @@ func ParseCPUsMetrics(input []byte) *CPUsMetrics {
return &cm
}

// Execute the sinfo command and return its output
func CPUsData() []byte {
cmd := exec.Command("sinfo", "-h", "-o %C")
stdout, err := cmd.StdoutPipe()
if err != nil {
log.Fatal(err)
}
if err := cmd.Start(); err != nil {
log.Fatal(err)
}
out, _ := ioutil.ReadAll(stdout)
if err := cmd.Wait(); err != nil {
log.Fatal(err)
}
return out
// CPUsGetMetrics passes the result of Subprocess("sinfo", "-h", "-o %C") to
// ParseCPUsMetrics and returns the parsed CPU metrics.
// NOTE(review): Subprocess is defined outside this view; presumably it
// returns the command's standard output. Confirm against subprocess.go.
func CPUsGetMetrics() *CPUsMetrics {
	sinfoOutput := Subprocess("sinfo", "-h", "-o %C")
	return ParseCPUsMetrics(sinfoOutput)
}

/*
Expand Down
26 changes: 26 additions & 0 deletions cpus_system_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// +build system

/* Copyright 2017 Victor Penso, Matteo Dessalvi, Rovanion Luckey

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */

package main

import (
"testing"
)

// TestCPUsGetMetrics logs the value returned by CPUsGetMetrics.
// It makes no assertions about the content.
func TestCPUsGetMetrics(t *testing.T) {
	metrics := CPUsGetMetrics()
	t.Logf("%+v", metrics)
}
28 changes: 17 additions & 11 deletions cpus_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/* Copyright 2017 Victor Penso, Matteo Dessalvi
// +build unit

/* Copyright 2017 Victor Penso, Matteo Dessalvi, Rovanion Luckey

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -22,15 +24,19 @@ import (
)

func TestCPUsMetrics(t *testing.T) {
// Read the input data from a file
file, err := os.Open("test_data/sinfo_cpus.txt")
if err != nil {
t.Fatalf("Can not open test data: %v", err)
file, _ := os.Open("test_data/sinfo_cpus.txt")
data, _ := ioutil.ReadAll(file)
cpus := ParseCPUsMetrics(data)
if cpus.alloc != 5725.0 {
t.Errorf("Miscount of alloc CPUs, got: %v, expected: %f", cpus.alloc, 5725.0)
}
if cpus.idle != 877.0 {
t.Errorf("Miscount of idle CPUs, got: %v, expected: %f", cpus.idle, 877.0)
}
if cpus.other != 34.0 {
t.Errorf("Miscount of other CPUs, got: %v, expected: %f", cpus.other, 34.0)
}
if cpus.total != 6636.0 {
t.Errorf("Miscount of total CPUs, got: %v, expected: %f", cpus.total, 6636.0)
}
data, err := ioutil.ReadAll(file)
t.Logf("%+v", ParseCPUsMetrics(data))
}

// TestCPUssGetMetrics logs the value returned by CPUsGetMetrics.
// It makes no assertions about the content.
func TestCPUssGetMetrics(t *testing.T) {
	metrics := CPUsGetMetrics()
	t.Logf("%+v", metrics)
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ go 1.12
require (
github.com/prometheus/client_golang v1.2.1
github.com/prometheus/common v0.7.0
github.com/stretchr/testify v1.3.0 // indirect
)
61 changes: 21 additions & 40 deletions gpus.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright 2020 Joeri Hermans, Victor Penso, Matteo Dessalvi
Copyright 2021 Rovanion Luckey

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -17,11 +18,8 @@ package main

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
"io/ioutil"
"os/exec"
"strings"
"strconv"
"strings"
)

type GPUsMetrics struct {
Expand All @@ -31,15 +29,9 @@ type GPUsMetrics struct {
utilization float64
}

func GPUsGetMetrics() *GPUsMetrics {
return ParseGPUsMetrics()
}

func ParseAllocatedGPUs() float64 {
func ParseAllocatedGPUs(sacctOutput []byte) float64 {
var num_gpus = 0.0

args := []string{"-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"}
output := string(Execute("sacct", args))
output := string(sacctOutput)
if len(output) > 0 {
for _, line := range strings.Split(output, "\n") {
if len(line) > 0 {
Expand All @@ -50,59 +42,48 @@ func ParseAllocatedGPUs() float64 {
}
}
}

return num_gpus
}

func ParseTotalGPUs() float64 {
var num_gpus = 0.0
func GetAllocatedGPUs() float64 {
return ParseAllocatedGPUs(
Subprocess("sacct", "-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"))
}

args := []string{"-h", "-o \"%n %G\""}
output := string(Execute("sinfo", args))
// ParseTotalGPUs sums the GPU counts found in sinfoOutput.
//
// Each non-empty line is stripped of surrounding double quotes and split on
// whitespace; the second field is taken as the GRES descriptor. A leading
// "gpu:" and anything from the first "(" onwards are removed, and what is
// left is parsed as a number and added to the total.
//
// Lines with fewer than two fields are skipped. Descriptors that do not
// parse as a number (for example "(null)") contribute zero.
func ParseTotalGPUs(sinfoOutput []byte) float64 {
	var num_gpus = 0.0
	output := string(sinfoOutput)
	if len(output) == 0 {
		return num_gpus
	}
	for _, line := range strings.Split(output, "\n") {
		if len(line) == 0 {
			continue
		}
		line = strings.Trim(line, "\"")
		fields := strings.Fields(line)
		if len(fields) < 2 {
			// No descriptor column on this line; indexing it would panic.
			continue
		}
		descriptor := strings.TrimPrefix(fields[1], "gpu:")
		descriptor = strings.Split(descriptor, "(")[0]
		// A parse failure deliberately counts as zero GPUs for the node.
		node_gpus, _ := strconv.ParseFloat(descriptor, 64)
		num_gpus += node_gpus
	}
	return num_gpus
}

func ParseGPUsMetrics() *GPUsMetrics {
func GetTotalGPUs() float64 {
return ParseTotalGPUs(
Subprocess("sinfo", "-h", "-o \"%n %G\""))
}

// GetGPUsMetrics collects the total and allocated GPU counts through
// GetTotalGPUs and GetAllocatedGPUs and derives the idle count and the
// utilization ratio from them.
//
// When the total is zero the utilization is left at 0 rather than being
// computed, since dividing by zero would yield NaN or Inf.
func GetGPUsMetrics() *GPUsMetrics {
	var gm GPUsMetrics
	total := GetTotalGPUs()
	allocated := GetAllocatedGPUs()
	gm.alloc = allocated
	gm.idle = total - allocated
	gm.total = total
	if total > 0 {
		gm.utilization = allocated / total
	}
	return &gm
}

// Execute runs command with the given arguments and returns everything the
// command wrote to standard output.
//
// Every failure (creating the pipe, starting the command, reading its
// output, or the command not completing successfully) is handed to
// log.Fatal.
func Execute(command string, arguments []string) []byte {
	cmd := exec.Command(command, arguments...)
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// Drain the pipe before calling Wait; a read error must not be dropped,
	// otherwise truncated output would be parsed as if it were complete.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}

/*
* Implement the Prometheus Collector interface and feed the
* Slurm scheduler metrics into it.
Expand Down Expand Up @@ -133,7 +114,7 @@ func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- cc.utilization
}
func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
cm := GPUsGetMetrics()
cm := GetGPUsMetrics()
ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc)
ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle)
ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total)
Expand Down
Loading