Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add salt_state_health gaugeVec #7

Merged
merged 3 commits into from
Apr 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,26 @@ salt_new_job_total{function="state.sls",state="test",success="false"} 1
salt_new_job_total{function="state.single",state="test.nop",success="true"} 3
```

### Health Minions metrics
By default, `state.highstate` also generates a health metric:
```
salt_state_health{function="state.highstate",minion="node1",state="highstate"} 1
```
* `1` means that the last time this function/state pair was called, the return was `successful`
* `0` means that the last time this function/state pair was called, the return was `failed`

You will find an example of Prometheus alerts that can be used with these default metrics in the `prometheus_alerts` directory.

The health metrics can be customized with the `-health-functions-filter` and `-health-states-filter` flags. Example usage:
```
./salt-exporter -health-functions-filter=test.ping,state.apply -health-states-filter=""
```
This will only generate minion health metrics for the `test.ping` function call:
```
salt_state_health{function="test.ping",minion="node1",state=""} 1
```
You can disable all the health metrics with this config switch:
```./salt-exporter -health-minions=false```
### `salt/job/<jid>/new`

It increases:
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ go 1.20
require (
github.com/google/go-cmp v0.5.9
github.com/prometheus/client_golang v1.14.0
github.com/rs/zerolog v1.29.0
github.com/rs/zerolog v1.29.1
github.com/vmihailenco/msgpack/v5 v5.3.5
)

Expand Down
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/coreos/go-systemd/v22 v22.3.3-0.20220203105225-a9a7ef127534/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
Expand Down Expand Up @@ -37,6 +38,8 @@ github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB
github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.29.0 h1:Zes4hju04hjbvkVkOhdl2HpZa+0PmVwigmo8XoORE5w=
github.com/rs/zerolog v1.29.0/go.mod h1:NILgTygv/Uej1ra5XxGf82ZFSLk58MFGAUS2o6usyD0=
github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc=
github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk=
Expand Down
7 changes: 7 additions & 0 deletions internal/metrics/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package metrics

// MetricsConfig groups the options that drive the per-minion health gauge.
type MetricsConfig struct {
	// HealthMinions turns the health gauge on or off.
	HealthMinions bool

	// HealthFunctionsFilters and HealthStatesFilters list the function names
	// and state names an event is matched against before a health value is
	// recorded for it.
	HealthFunctionsFilters, HealthStatesFilters []string
}
44 changes: 40 additions & 4 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,24 @@ import (
"github.com/rs/zerolog/log"
)

func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent) {
// boolToFloat64 maps true to 1 and false to 0 so a boolean can be stored
// as a gauge value.
func boolToFloat64(b bool) float64 {
	var value float64 // zero value covers the false case
	if b {
		value = 1
	}
	return value
}

// contains reports whether str is equal to at least one element of slice.
// A nil or empty slice never matches.
func contains(slice []string, str string) bool {
	for idx := 0; idx < len(slice); idx++ {
		if slice[idx] != str {
			continue
		}
		return true
	}
	return false
}

func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent, metricsConfig MetricsConfig) {
newJobCounter := promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "salt_new_job_total",
Expand Down Expand Up @@ -49,6 +66,14 @@ func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent) {
[]string{"function", "state"},
)

lastStateHealth := promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "salt_state_health",
Help: "Last state success state, 0=Failed, 1=Success",
},
[]string{"minion", "function", "state"},
)

for {
select {
case <-ctx.Done():
Expand Down Expand Up @@ -76,16 +101,27 @@ func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent) {
strconv.FormatBool(success),
).Inc()
} else {
sucess := strconv.FormatBool(event.Data.Success)
success := strconv.FormatBool(event.Data.Success)

responsesCounter.WithLabelValues(
event.Data.Id,
sucess,
success,
).Inc()
functionResponsesCounter.WithLabelValues(
event.Data.Fun,
state,
sucess,
success,
).Inc()

if metricsConfig.HealthMinions {
if contains(metricsConfig.HealthFunctionsFilters, event.Data.Fun) &&
contains(metricsConfig.HealthStatesFilters, state) {
lastStateHealth.WithLabelValues(
event.Data.Id,
event.Data.Fun,
state).Set(boolToFloat64(event.Data.Success))
}
}
}
}

Expand Down
19 changes: 18 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"
"os"
"os/signal"
"strings"
"syscall"

"github.com/kpetremann/salt-exporter/internal/logging"
Expand All @@ -28,6 +29,11 @@ func main() {
tlsEnabled := flag.Bool("tls", false, "enable TLS")
tlsCert := flag.String("tls-cert", "", "TLS certificated")
tlsKey := flag.String("tls-key", "", "TLS private key")
healthMinions := flag.Bool("health-minions", true, "Enable health metric for each minion")
healthFunctionsFilters := flag.String("health-functions-filter", "state.highstate",
"Apply filter on functions to monitor, separated by a comma")
healthStatesFilters := flag.String("health-states-filter", "highstate",
"Apply filter on states to monitor, separated by a comma")
flag.Parse()

logging.ConfigureLogging()
Expand All @@ -47,6 +53,17 @@ func main() {
}
}

var metricsConfig metrics.MetricsConfig
metricsConfig.HealthMinions = *healthMinions
metricsConfig.HealthFunctionsFilters = strings.Split(*healthFunctionsFilters, ",")
metricsConfig.HealthStatesFilters = strings.Split(*healthStatesFilters, ",")

if metricsConfig.HealthMinions {
log.Info().Msg("health-minions: metrics are enabled")
log.Info().Msg("health-minions: functions filters: " + *healthFunctionsFilters)
log.Info().Msg("health-minions: states filters: " + *healthStatesFilters)
}

listenSocket := fmt.Sprint(*listenAddress, ":", *listenPort)

ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
Expand All @@ -59,7 +76,7 @@ func main() {
eventListener := events.NewEventListener(ctx, eventChan)

go eventListener.ListenEvents()
go metrics.ExposeMetrics(ctx, eventChan)
go metrics.ExposeMetrics(ctx, eventChan, metricsConfig)

// start http server
log.Info().Msg("exposing metrics on " + listenSocket + "/metrics")
Expand Down
20 changes: 20 additions & 0 deletions prometheus_alerts/highstate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Example Prometheus alerting rules built on the salt_state_health gauge.
# Both rules select series for function="state.highstate", state="highstate"
# whose value is 0, aggregated per minion; they differ only in how long the
# condition must hold and in the severity label.
groups:
- name: saltstack
rules:
# Critical: the highstate health value has been 0 for 60 minutes.
- alert: SaltExporterLastHighstateSuccess
expr: sum by(minion) (salt_state_health{function="state.highstate", state="highstate"} == 0)
for: 60m
labels:
severity: critical
minion: "{{ $labels.minion }}"
annotations:
summary: "Salt Last Successful Highstate Failed (minion {{ $labels.minion }})"
description: "Salt Last Successful Highstate failed since > 60m"
# Info: same expression, raised after 10 minutes.
- alert: SaltExporterLastHighstateSuccessInfo
expr: sum by(minion) (salt_state_health{function="state.highstate", state="highstate"} == 0)
for: 10m
labels:
severity: info
minion: "{{ $labels.minion }}"
annotations:
summary: "Salt Last Successful Highstate Failed (minion {{ $labels.minion }})"