From 95efb86f6b49fe12ae69920eb660f2512cc1d64b Mon Sep 17 00:00:00 2001 From: Gavin Lam Date: Sat, 9 Mar 2024 04:00:06 -0500 Subject: [PATCH] Add new collector and metrics for watchdog (#2309) (#2880) Signed-off-by: Gavin Lam --- README.md | 1 + collector/fixtures/e2e-64k-page-output.txt | 26 ++++ collector/fixtures/e2e-output.txt | 26 ++++ collector/fixtures/sys.ttar | 69 +++++++++++ collector/watchdog.go | 133 +++++++++++++++++++++ collector/watchdog_test.go | 92 ++++++++++++++ end-to-end-test.sh | 1 + 7 files changed, 348 insertions(+) create mode 100644 collector/watchdog.go create mode 100644 collector/watchdog_test.go diff --git a/README.md b/README.md index 87b4e4ba01..5856df806c 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,7 @@ softirqs | Exposes detailed softirq statistics from `/proc/softirqs`. | Linux sysctl | Expose sysctl values from `/proc/sys`. Use `--collector.sysctl.include(-info)` to configure. | Linux systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux +watchdog | Exposes statistics from `/sys/class/watchdog` | Linux wifi | Exposes WiFi device and station statistics. | Linux xfrm | Exposes statistics from `/proc/net/xfrm_stat` | Linux zoneinfo | Exposes NUMA memory zone metrics. 
| Linux diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index c670fd9560..e3de810caf 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -2945,6 +2945,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1 node_scrape_collector_success{collector="time"} 1 node_scrape_collector_success{collector="udp_queues"} 1 node_scrape_collector_success{collector="vmstat"} 1 +node_scrape_collector_success{collector="watchdog"} 1 node_scrape_collector_success{collector="wifi"} 1 node_scrape_collector_success{collector="xfrm"} 1 node_scrape_collector_success{collector="xfs"} 1 @@ -3218,6 +3219,31 @@ node_vmstat_pswpin 1476 # HELP node_vmstat_pswpout /proc/vmstat information field pswpout. # TYPE node_vmstat_pswpout untyped node_vmstat_pswpout 35045 +# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog//access_cs0 +# TYPE node_watchdog_access_cs0 gauge +node_watchdog_access_cs0{name="watchdog0"} 0 +# HELP node_watchdog_bootstatus Value of /sys/class/watchdog//bootstatus +# TYPE node_watchdog_bootstatus gauge +node_watchdog_bootstatus{name="watchdog0"} 1 +# HELP node_watchdog_fw_version Value of /sys/class/watchdog//fw_version +# TYPE node_watchdog_fw_version gauge +node_watchdog_fw_version{name="watchdog0"} 2 +# HELP node_watchdog_info Info of /sys/class/watchdog/ +# TYPE node_watchdog_info gauge +node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1 +node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1 +# HELP node_watchdog_nowayout Value of /sys/class/watchdog//nowayout +# TYPE node_watchdog_nowayout gauge +node_watchdog_nowayout{name="watchdog0"} 0 +# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog//pretimeout +# TYPE node_watchdog_pretimeout_seconds gauge 
+node_watchdog_pretimeout_seconds{name="watchdog0"} 120 +# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog//timeleft +# TYPE node_watchdog_timeleft_seconds gauge +node_watchdog_timeleft_seconds{name="watchdog0"} 300 +# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog//timeout +# TYPE node_watchdog_timeout_seconds gauge +node_watchdog_timeout_seconds{name="watchdog0"} 60 # HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz. # TYPE node_wifi_interface_frequency_hertz gauge node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 2caceedbb4..5b6cfbe1dd 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -2967,6 +2967,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1 node_scrape_collector_success{collector="time"} 1 node_scrape_collector_success{collector="udp_queues"} 1 node_scrape_collector_success{collector="vmstat"} 1 +node_scrape_collector_success{collector="watchdog"} 1 node_scrape_collector_success{collector="wifi"} 1 node_scrape_collector_success{collector="xfrm"} 1 node_scrape_collector_success{collector="xfs"} 1 @@ -3240,6 +3241,31 @@ node_vmstat_pswpin 1476 # HELP node_vmstat_pswpout /proc/vmstat information field pswpout. 
# TYPE node_vmstat_pswpout untyped node_vmstat_pswpout 35045 +# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog//access_cs0 +# TYPE node_watchdog_access_cs0 gauge +node_watchdog_access_cs0{name="watchdog0"} 0 +# HELP node_watchdog_bootstatus Value of /sys/class/watchdog//bootstatus +# TYPE node_watchdog_bootstatus gauge +node_watchdog_bootstatus{name="watchdog0"} 1 +# HELP node_watchdog_fw_version Value of /sys/class/watchdog//fw_version +# TYPE node_watchdog_fw_version gauge +node_watchdog_fw_version{name="watchdog0"} 2 +# HELP node_watchdog_info Info of /sys/class/watchdog/ +# TYPE node_watchdog_info gauge +node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1 +node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1 +# HELP node_watchdog_nowayout Value of /sys/class/watchdog//nowayout +# TYPE node_watchdog_nowayout gauge +node_watchdog_nowayout{name="watchdog0"} 0 +# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog//pretimeout +# TYPE node_watchdog_pretimeout_seconds gauge +node_watchdog_pretimeout_seconds{name="watchdog0"} 120 +# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog//timeleft +# TYPE node_watchdog_timeleft_seconds gauge +node_watchdog_timeleft_seconds{name="watchdog0"} 300 +# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog//timeout +# TYPE node_watchdog_timeout_seconds gauge +node_watchdog_timeout_seconds{name="watchdog0"} 60 # HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz. 
# TYPE node_wifi_interface_frequency_hertz gauge node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 213c8b3867..0573cfd312 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -1717,6 +1717,75 @@ SymlinkTo: ../../devices/virtual/thermal/cooling_device0 Path: sys/class/thermal/thermal_zone0 SymlinkTo: ../../devices/virtual/thermal/thermal_zone0 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/watchdog +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/watchdog/watchdog0 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/access_cs0 +Lines: 1 +0EOF +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/bootstatus +Lines: 1 +1EOF +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/fw_version +Lines: 1 +2EOF +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/identity +Lines: 1 +Software WatchdogEOF +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/nowayout +Lines: 1 +0EOF +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/options +Lines: 1 +0x8380EOF +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/pretimeout +Lines: 1 +120EOF +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/watchdog/watchdog0/pretimeout_governor +Lines: 1 +noopEOF +Mode: 644 +# ttar - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/class/watchdog/watchdog0/state
+Lines: 1
+activeEOF
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/class/watchdog/watchdog0/status
+Lines: 1
+0x8000EOF
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/class/watchdog/watchdog0/timeleft
+Lines: 1
+300EOF
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/class/watchdog/watchdog0/timeout
+Lines: 1
+60EOF
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Directory: sys/class/watchdog/watchdog1
+Mode: 775
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 Directory: sys/devices
 Mode: 755
 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
diff --git a/collector/watchdog.go b/collector/watchdog.go
new file mode 100644
index 0000000000..b4cb83d535
--- /dev/null
+++ b/collector/watchdog.go
@@ -0,0 +1,146 @@
+// Copyright 2023 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build linux && !nowatchdog
+// +build linux,!nowatchdog
+
+package collector
+
+import (
+	"fmt"
+
+	"github.com/go-kit/log"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/procfs/sysfs"
+)
+
+// watchdogCollector exposes the watchdog devices listed under
+// /sys/class/watchdog as Prometheus metrics.
+type watchdogCollector struct {
+	fs     sysfs.FS
+	logger log.Logger
+}
+
+func init() {
+	// The collector is registered as disabled by default.
+	registerCollector("watchdog", defaultDisabled, NewWatchdogCollector)
+}
+
+// NewWatchdogCollector returns a new Collector exposing watchdog stats.
+// It fails if the sysfs tree rooted at *sysPath cannot be opened.
+func NewWatchdogCollector(logger log.Logger) (Collector, error) {
+	fs, err := sysfs.NewFS(*sysPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open sysfs: %w", err)
+	}
+
+	return &watchdogCollector{
+		fs:     fs,
+		logger: logger,
+	}, nil
+}
+
+// Metric descriptors. Each per-attribute gauge carries a single "name" label
+// (the watchdog device name); the info metric additionally carries the
+// string-valued attributes as labels.
+var (
+	watchdogBootstatusDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "bootstatus"),
+		"Value of /sys/class/watchdog//bootstatus",
+		[]string{"name"}, nil,
+	)
+	watchdogFwVersionDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "fw_version"),
+		"Value of /sys/class/watchdog//fw_version",
+		[]string{"name"}, nil,
+	)
+	watchdogNowayoutDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "nowayout"),
+		"Value of /sys/class/watchdog//nowayout",
+		[]string{"name"}, nil,
+	)
+	watchdogTimeleftDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "timeleft_seconds"),
+		"Value of /sys/class/watchdog//timeleft",
+		[]string{"name"}, nil,
+	)
+	watchdogTimeoutDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "timeout_seconds"),
+		"Value of /sys/class/watchdog//timeout",
+		[]string{"name"}, nil,
+	)
+	watchdogPretimeoutDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "pretimeout_seconds"),
+		"Value of /sys/class/watchdog//pretimeout",
+		[]string{"name"}, nil,
+	)
+	watchdogAccessCs0Desc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "access_cs0"),
+		"Value of /sys/class/watchdog//access_cs0",
+		[]string{"name"}, nil,
+	)
+	watchdogInfoDesc = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "watchdog", "info"),
+		"Info of /sys/class/watchdog/",
+		[]string{"name", "options", "identity", "state", "status", "pretimeout_governor"}, nil,
+	)
+)
+
+// toLabelValue dereferences ptr for use as a label value, mapping a nil
+// pointer to the empty string.
+func toLabelValue(ptr *string) string {
+	if ptr == nil {
+		return ""
+	}
+	return *ptr
+}
+
+// Update reads all watchdog devices from sysfs and, for each device, sends
+// one gauge per numeric attribute that is present plus one info metric.
+// Attributes that are nil are skipped; the info metric is always sent, with
+// nil string attributes rendered as empty label values.
+func (c *watchdogCollector) Update(ch chan<- prometheus.Metric) error {
+	watchdogClass, err := c.fs.WatchdogClass()
+	if err != nil {
+		return err
+	}
+
+	for _, wd := range watchdogClass {
+		if wd.Bootstatus != nil {
+			ch <- prometheus.MustNewConstMetric(watchdogBootstatusDesc, prometheus.GaugeValue, float64(*wd.Bootstatus), wd.Name)
+		}
+		if wd.FwVersion != nil {
+			ch <- prometheus.MustNewConstMetric(watchdogFwVersionDesc, prometheus.GaugeValue, float64(*wd.FwVersion), wd.Name)
+		}
+		if wd.Nowayout != nil {
+			ch <- prometheus.MustNewConstMetric(watchdogNowayoutDesc, prometheus.GaugeValue, float64(*wd.Nowayout), wd.Name)
+		}
+		if wd.Timeleft != nil {
+			ch <- prometheus.MustNewConstMetric(watchdogTimeleftDesc, prometheus.GaugeValue, float64(*wd.Timeleft), wd.Name)
+		}
+		if wd.Timeout != nil {
+			ch <- prometheus.MustNewConstMetric(watchdogTimeoutDesc, prometheus.GaugeValue, float64(*wd.Timeout), wd.Name)
+		}
+		if wd.Pretimeout != nil {
+			ch <- prometheus.MustNewConstMetric(watchdogPretimeoutDesc, prometheus.GaugeValue, float64(*wd.Pretimeout), wd.Name)
+		}
+		if wd.AccessCs0 != nil {
+			ch <- prometheus.MustNewConstMetric(watchdogAccessCs0Desc, prometheus.GaugeValue, float64(*wd.AccessCs0), wd.Name)
+		}
+
+		ch <- prometheus.MustNewConstMetric(watchdogInfoDesc, prometheus.GaugeValue, 1.0,
+			wd.Name, toLabelValue(wd.Options), toLabelValue(wd.Identity), toLabelValue(wd.State), toLabelValue(wd.Status), toLabelValue(wd.PretimeoutGovernor))
+	}
+
+	return nil
+}
diff --git a/collector/watchdog_test.go b/collector/watchdog_test.go
new file mode 100644
index 0000000000..7cf9418574
--- /dev/null
+++ b/collector/watchdog_test.go
@@ -0,0 +1,93 @@
+// Copyright 2023 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build linux && !nowatchdog
+// +build linux,!nowatchdog
+
+package collector
+
+import (
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/go-kit/log"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
+)
+
+// testWatchdogCollector adapts a Collector to the prometheus.Collector
+// interface so it can be registered with a registry in tests.
+type testWatchdogCollector struct {
+	wc Collector
+}
+
+// Collect runs the wrapped collector, panicking on failure because the
+// prometheus.Collector interface offers no way to return an error.
+func (c testWatchdogCollector) Collect(ch chan<- prometheus.Metric) {
+	if err := c.wc.Update(ch); err != nil {
+		panic(fmt.Errorf("failed to update collector: %w", err))
+	}
+}
+
+// Describe derives the descriptors from a Collect run.
+func (c testWatchdogCollector) Describe(ch chan<- *prometheus.Desc) {
+	prometheus.DescribeByCollect(c, ch)
+}
+
+// TestWatchdogStats checks the metrics produced from the fixtures/sys tree
+// against the expected exposition text.
+func TestWatchdogStats(t *testing.T) {
+	testcase := `# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog//access_cs0
+	# TYPE node_watchdog_access_cs0 gauge
+	node_watchdog_access_cs0{name="watchdog0"} 0
+	# HELP node_watchdog_bootstatus Value of /sys/class/watchdog//bootstatus
+	# TYPE node_watchdog_bootstatus gauge
+	node_watchdog_bootstatus{name="watchdog0"} 1
+	# HELP node_watchdog_fw_version Value of /sys/class/watchdog//fw_version
+	# TYPE node_watchdog_fw_version gauge
+	node_watchdog_fw_version{name="watchdog0"} 2
+	# HELP node_watchdog_info Info of /sys/class/watchdog/
+	# TYPE node_watchdog_info gauge
+	node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
+	node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
+	# HELP node_watchdog_nowayout Value of /sys/class/watchdog//nowayout
+	# TYPE node_watchdog_nowayout gauge
+	node_watchdog_nowayout{name="watchdog0"} 0
+	# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog//pretimeout
+	# TYPE node_watchdog_pretimeout_seconds gauge
+	node_watchdog_pretimeout_seconds{name="watchdog0"} 120
+	# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog//timeleft
+	# TYPE node_watchdog_timeleft_seconds gauge
+	node_watchdog_timeleft_seconds{name="watchdog0"} 300
+	# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog//timeout
+	# TYPE node_watchdog_timeout_seconds gauge
+	node_watchdog_timeout_seconds{name="watchdog0"} 60
+	`
+	*sysPath = "fixtures/sys"
+
+	logger := log.NewLogfmtLogger(os.Stderr)
+	c, err := NewWatchdogCollector(logger)
+	if err != nil {
+		t.Fatal(err)
+	}
+	reg := prometheus.NewRegistry()
+	reg.MustRegister(&testWatchdogCollector{wc: c})
+
+	// GatherAndCompare drives Collect (and so Update) through the registry.
+	err = testutil.GatherAndCompare(reg, strings.NewReader(testcase))
+	if err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/end-to-end-test.sh b/end-to-end-test.sh
index 60263d54eb..35e4534e6d 100755
--- a/end-to-end-test.sh
+++ b/end-to-end-test.sh
@@ -49,6 +49,7 @@ enabled_collectors=$(cat << COLLECTORS
   thermal_zone
   udp_queues
   vmstat
+  watchdog
   wifi
   xfrm
   xfs