Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement SBD watchdog and msgwait metrics #174

Merged
merged 7 commits into from
Sep 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions collector/sbd/sbd.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os"
"os/exec"
"regexp"
"strconv"
"strings"

"github.com/pkg/errors"
Expand All @@ -20,6 +21,7 @@ const subsystem = "sbd"
const SBD_STATUS_UNHEALTHY = "unhealthy"
const SBD_STATUS_HEALTHY = "healthy"

// NewCollector creates a new SBD collector.
func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
err := checkArguments(sbdPath, sbdConfigPath)
if err != nil {
Expand All @@ -33,6 +35,7 @@ func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
}

c.SetDescriptor("devices", "SBD devices; one line per device", []string{"device", "status"})
c.SetDescriptor("timeouts", "SBD timeouts for each device and type", []string{"device", "type"})

return c, nil
}
Expand Down Expand Up @@ -68,6 +71,15 @@ func (c *sbdCollector) CollectWithError(ch chan<- prometheus.Metric) error {
ch <- c.MakeGaugeMetric("devices", 1, sbdDev, sbdStatus)
}

sbdWatchdogs, sbdMsgWaits := c.getSbdTimeouts(sbdDevices)
for sbdDev, sbdWatchdog := range sbdWatchdogs {
ch <- c.MakeGaugeMetric("timeouts", sbdWatchdog, sbdDev, "watchdog")
}

for sbdDev, sbdMsgWait := range sbdMsgWaits {
ch <- c.MakeGaugeMetric("timeouts", sbdMsgWait, sbdDev, "msgwait")
}

return nil
}

Expand Down Expand Up @@ -132,3 +144,39 @@ func (c *sbdCollector) getSbdDeviceStatuses(sbdDevices []string) map[string]stri

return sbdStatuses
}

// getSbdTimeouts runs `<sbdPath> -d <device> dump` for every device in
// sbdDevices and extracts the watchdog and msgwait timeouts from the output.
//
// It returns two maps keyed by device: the first holds the watchdog timeouts,
// the second the msgwait timeouts. A device whose dump lacks either of the two
// timeout lines is left out of both maps.
func (c *sbdCollector) getSbdTimeouts(sbdDevices []string) (map[string]float64, map[string]float64) {
	// Compile the patterns once per call instead of once per device; the
	// capture group yields the number directly, so no second regex pass is
	// needed to pull it out of the matched line.
	watchdogRegex := regexp.MustCompile(`Timeout \(watchdog\) *: (\d+)`)
	msgWaitRegex := regexp.MustCompile(`Timeout \(msgwait\) *: (\d+)`)

	sbdWatchdogs := make(map[string]float64, len(sbdDevices))
	sbdMsgWaits := make(map[string]float64, len(sbdDevices))

	for _, sbdDev := range sbdDevices {
		// The error is deliberately ignored (best effort): whatever the
		// command wrote to stdout is still parsed, and a dump without the
		// expected timeout lines is skipped right below.
		sbdDump, _ := exec.Command(c.sbdPath, "-d", sbdDev, "dump").Output()
		dump := string(sbdDump)

		watchdogMatch := watchdogRegex.FindStringSubmatch(dump)
		msgWaitMatch := msgWaitRegex.FindStringSubmatch(dump)
		if watchdogMatch == nil || msgWaitMatch == nil {
			continue
		}

		// Index 1 is the captured run of digits. Parsing can still fail
		// (e.g. out of range), in which case that single value is dropped.
		if timeout, err := strconv.ParseFloat(watchdogMatch[1], 64); err == nil {
			sbdWatchdogs[sbdDev] = timeout
		}
		if timeout, err := strconv.ParseFloat(msgWaitMatch[1], 64); err == nil {
			sbdMsgWaits[sbdDev] = timeout
		}
	}

	return sbdWatchdogs, sbdMsgWaits
}
9 changes: 8 additions & 1 deletion collector/sbd/sbd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,13 @@ func TestNewSbdCollectorChecksSbdExecutableBits(t *testing.T) {
}

func TestSBDCollector(t *testing.T) {
collector, _ := NewCollector("../../test/fake_sbd.sh", "../../test/fake_sbdconfig")
collector, _ := NewCollector("../../test/fake_sbd_dump.sh", "../../test/fake_sbdconfig")
assertcustom.Metrics(t, collector, "sbd.metrics")
}

// TestWatchdog builds a collector on top of the fake dump script, checks that
// construction succeeds, and compares the collected metrics against the
// sbd.metrics fixture.
func TestWatchdog(t *testing.T) {
collector, err := NewCollector("../../test/fake_sbd_dump.sh", "../../test/fake_sbdconfig")

// Unlike TestSBDCollector, the constructor error is asserted here.
assert.Nil(t, err)
assertcustom.Metrics(t, collector, "sbd.metrics")
}
15 changes: 14 additions & 1 deletion doc/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,8 @@ The status of each Corosync ring; `1` means healthy, `0` means faulty.
The SBD subsystem collects device stats by parsing its configuration and the output of `sbd --dump`.

0. [Sample](../test/sbd.metrics)
2. [`ha_cluster_sbd_devices`](#ha_cluster_sbd_devices)
1. [`ha_cluster_sbd_devices`](#ha_cluster_sbd_devices)
2. [`ha_cluster_sbd_timeouts`](#ha_cluster_sbd_timeouts)

### `ha_cluster_sbd_devices`

Expand All @@ -214,6 +215,18 @@ Either the value is `1`, or the line is absent altogether.

The total number of lines for this metric will be the cardinality of `device`.

### `ha_cluster_sbd_timeouts`

#### Description

The SBD timeouts per SBD device.
The value is an integer expressing the timeout.

#### Labels

- `device`: the path of the SBD device
- `type`: either `watchdog` or `msgwait`


## DRBD

Expand Down
14 changes: 14 additions & 0 deletions test/fake_sbd_dump.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Test double for the sbd executable: prints a fixed dump header to stdout.
# Any arguments passed to the script are ignored, so every invocation reports
# the same watchdog (9) and msgwait (10) timeouts.
cat <<EOF
==Dumping header on disk /dev/vdc
Header version : 2.1
UUID : 1ed3171d-066d-47ca-8f76-aec25d9efed4
Number of slots : 255
Sector size : 512
Timeout (watchdog) : 9
Timeout (allocate) : 2
Timeout (loop) : 1
Timeout (msgwait) : 10
==Header on disk /dev/vdc is dumped
EOF
8 changes: 7 additions & 1 deletion test/sbd.metrics
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# HELP ha_cluster_sbd_devices SBD devices; one line per device
# TYPE ha_cluster_sbd_devices gauge
ha_cluster_sbd_devices{device="/dev/vdc",status="healthy"} 1
ha_cluster_sbd_devices{device="/dev/vdd",status="unhealthy"} 1
ha_cluster_sbd_devices{device="/dev/vdd",status="healthy"} 1
# HELP ha_cluster_sbd_timeouts SBD timeouts for each device and type
# TYPE ha_cluster_sbd_timeouts gauge
ha_cluster_sbd_timeouts{device="/dev/vdc",type="msgwait"} 10
ha_cluster_sbd_timeouts{device="/dev/vdc",type="watchdog"} 9
ha_cluster_sbd_timeouts{device="/dev/vdd",type="msgwait"} 10
ha_cluster_sbd_timeouts{device="/dev/vdd",type="watchdog"} 9