Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement SBD watchdog and msgwait metrics #174

Merged
merged 7 commits into from
Sep 3, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions collector/sbd/sbd.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os"
"os/exec"
"regexp"
"strconv"
"strings"

"github.com/pkg/errors"
Expand All @@ -20,6 +21,7 @@ const subsystem = "sbd"
// Status strings for an sbd device.
// NOTE(review): presumably these are the values of the "status" label on the
// "devices" metric — confirm against getSbdDeviceStatuses.
const (
	SBD_STATUS_UNHEALTHY = "unhealthy"
	SBD_STATUS_HEALTHY   = "healthy"
)

// NewCollector creates a new sbd collector.
func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
err := checkArguments(sbdPath, sbdConfigPath)
if err != nil {
Expand All @@ -33,6 +35,8 @@ func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
}

c.SetDescriptor("devices", "SBD devices; one line per device", []string{"device", "status"})
c.SetDescriptor("watchdog_timeout", "sbd watchdog timeout", []string{"device"})
c.SetDescriptor("msgwait_timeout", "sbd msgwait timeout", []string{"device"})

return c, nil
}
Expand Down Expand Up @@ -68,6 +72,16 @@ func (c *sbdCollector) CollectWithError(ch chan<- prometheus.Metric) error {
ch <- c.MakeGaugeMetric("devices", 1, sbdDev, sbdStatus)
}

sbdWatchdogs := c.getSbdWatchDogTimeout(sbdDevices)
for sbdDev, sbdWatchdog := range sbdWatchdogs {
ch <- c.MakeGaugeMetric("watchdog_timeout", sbdWatchdog, sbdDev)
}

sbdMsgWaits := c.getSbdMsgWaitTimeout(sbdDevices)
for sbdDev, sbdMsgWait := range sbdMsgWaits {
ch <- c.MakeGaugeMetric("msgwait_timeout", sbdMsgWait, sbdDev)
}

return nil
}

Expand Down Expand Up @@ -132,3 +146,57 @@ func (c *sbdCollector) getSbdDeviceStatuses(sbdDevices []string) map[string]stri

return sbdStatuses
}

// Patterns for the timeout lines printed by `sbd -d <device> dump`. The single
// capture group holds the numeric value. They are compiled once at package
// scope instead of on every loop iteration.
var (
	// Matches a line such as: Timeout (watchdog) : 5
	sbdWatchdogTimeoutRegexp = regexp.MustCompile(`Timeout \(watchdog\) *: (\d+)`)
	// Matches a line such as: Timeout (msgwait) : 10
	sbdMsgWaitTimeoutRegexp = regexp.MustCompile(`Timeout \(msgwait\) *: (\d+)`)
)

// getSbdWatchDogTimeout returns the watchdog timeout of each sbd device, keyed
// by device name. Devices whose dump cannot be obtained or does not contain a
// watchdog timeout line are left out of the map.
func (c *sbdCollector) getSbdWatchDogTimeout(sbdDevices []string) map[string]float64 {
	return c.getSbdDumpTimeouts(sbdDevices, sbdWatchdogTimeoutRegexp)
}

// getSbdMsgWaitTimeout returns the msgwait timeout of each sbd device, keyed by
// device name. Devices whose dump cannot be obtained or does not contain a
// msgwait timeout line are left out of the map.
func (c *sbdCollector) getSbdMsgWaitTimeout(sbdDevices []string) map[string]float64 {
	return c.getSbdDumpTimeouts(sbdDevices, sbdMsgWaitTimeoutRegexp)
}

// getSbdDumpTimeouts runs `<sbdPath> -d <device> dump` for every device and
// extracts the number captured by the first group of timeoutRegexp.
//
// The returned map is never nil and only contains devices for which the
// command succeeded, the pattern matched and the captured value parsed as a
// float.
func (c *sbdCollector) getSbdDumpTimeouts(sbdDevices []string, timeoutRegexp *regexp.Regexp) map[string]float64 {
	timeouts := make(map[string]float64, len(sbdDevices))
	for _, sbdDev := range sbdDevices {
		sbdDump, err := exec.Command(c.sbdPath, "-d", sbdDev, "dump").Output()
		if err != nil {
			// The dump failed, so its output cannot be trusted; skip this
			// device rather than exporting a possibly bogus value.
			continue
		}

		match := timeoutRegexp.FindSubmatch(sbdDump)
		if match == nil {
			continue
		}

		// match[1] is the (\d+) capture group, i.e. the timeout itself.
		timeout, err := strconv.ParseFloat(string(match[1]), 64)
		if err != nil {
			continue
		}
		timeouts[sbdDev] = timeout
	}
	return timeouts
}
MalloZup marked this conversation as resolved.
Show resolved Hide resolved
9 changes: 8 additions & 1 deletion collector/sbd/sbd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,13 @@ func TestNewSbdCollectorChecksSbdExecutableBits(t *testing.T) {
}

// TestSBDCollector builds a collector on top of the fake sbd dump script and
// hands it to assertcustom.Metrics together with the "sbd.metrics" fixture
// name.
func TestSBDCollector(t *testing.T) {
	// Only the fake_sbd_dump.sh variant is kept: the earlier declaration using
	// fake_sbd.sh was immediately redeclared with := (no new variables), which
	// does not compile.
	collector, err := NewCollector("../../test/fake_sbd_dump.sh", "../../test/fake_sbdconfig")
	if err != nil {
		t.Fatalf("creating sbd collector: %v", err)
	}
	assertcustom.Metrics(t, collector, "sbd.metrics")
}

// TestWatchdog creates a collector backed by the fake sbd dump script, requires
// that construction returns no error, and passes the collector to
// assertcustom.Metrics with the "sbd.metrics" fixture name.
func TestWatchdog(t *testing.T) {
	const (
		fakeSbdPath   = "../../test/fake_sbd_dump.sh"
		fakeSbdConfig = "../../test/fake_sbdconfig"
	)

	c, newErr := NewCollector(fakeSbdPath, fakeSbdConfig)
	assert.Nil(t, newErr)

	assertcustom.Metrics(t, c, "sbd.metrics")
}
14 changes: 14 additions & 0 deletions test/fake_sbd_dump.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Test double for the sbd executable.
# It ignores every command-line argument and always prints the same canned
# header dump (including the watchdog and msgwait timeout lines) to stdout,
# so the output is identical whichever device is requested.
cat <<EOF
==Dumping header on disk /dev/vdc
Header version : 2.1
UUID : 1ed3171d-066d-47ca-8f76-aec25d9efed4
Number of slots : 255
Sector size : 512
Timeout (watchdog) : 9
Timeout (allocate) : 2
Timeout (loop) : 1
Timeout (msgwait) : 10
==Header on disk /dev/vdc is dumped
EOF
16 changes: 12 additions & 4 deletions test/sbd.metrics
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
# HELP ha_cluster_sbd_devices SBD devices; one line per device
# TYPE ha_cluster_sbd_devices gauge
ha_cluster_sbd_devices{device="/dev/vdc",status="healthy"} 1
ha_cluster_sbd_devices{device="/dev/vdd",status="unhealthy"} 1
# HELP ha_cluster_sbd_devices SBD devices; one line per device
# TYPE ha_cluster_sbd_devices gauge
ha_cluster_sbd_devices{device="/dev/vdc",status="healthy"} 1
ha_cluster_sbd_devices{device="/dev/vdd",status="healthy"} 1
# HELP ha_cluster_sbd_msgwait_timeout sbd msgwait timeout
# TYPE ha_cluster_sbd_msgwait_timeout gauge
ha_cluster_sbd_msgwait_timeout{device="/dev/vdc"} 10
ha_cluster_sbd_msgwait_timeout{device="/dev/vdd"} 10
# HELP ha_cluster_sbd_watchdog_timeout sbd watchdog timeout
# TYPE ha_cluster_sbd_watchdog_timeout gauge
ha_cluster_sbd_watchdog_timeout{device="/dev/vdc"} 9
ha_cluster_sbd_watchdog_timeout{device="/dev/vdd"} 9