diff --git a/README.md b/README.md index 94dcabe..34c583c 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,26 @@ salt_new_job_total{function="state.sls",state="test",success="false"} 1 salt_new_job_total{function="state.single",state="test.nop",success="true"} 3 ``` +### Health Minions metrics +By default, the state.highstate function will also generate a health metric: +``` +salt_state_health{function="state.highstate",minion="node1",state="highstate"} 1 +``` +* `1` means that the last time this function/state pair was called, the return was `successful` +* `0` means that the last time this function/state pair was called, the return was `failed` + +You will find an example of Prometheus alerts that can be used with these default metrics in the `prometheus_alerts` directory. + +The health metrics can be customized using the `-health-functions-filter` and `-health-states-filter` flags. Example usage: +``` +./salt-exporter -health-functions-filter=test.ping,state.apply -health-states-filter="" +``` +This will, for example, generate the following health metric for a test.ping function call: +``` +salt_state_health{function="test.ping",minion="node1",state=""} 1 +``` +You can disable all the health metrics with this config switch: +```./salt-exporter -health-minions=false``` ### `salt/job//new` It increases: diff --git a/go.mod b/go.mod index 46779e5..37b8d7b 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.20 require ( github.com/google/go-cmp v0.5.9 github.com/prometheus/client_golang v1.14.0 - github.com/rs/zerolog v1.29.0 + github.com/rs/zerolog v1.29.1 github.com/vmihailenco/msgpack/v5 v5.3.5 ) diff --git a/go.sum b/go.sum index 63cc928..fb2a2b9 100644 --- a/go.sum +++ b/go.sum @@ -3,6 +3,7 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= 
github.com/coreos/go-systemd/v22 v22.3.3-0.20220203105225-a9a7ef127534/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= @@ -37,6 +38,8 @@ github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.29.0 h1:Zes4hju04hjbvkVkOhdl2HpZa+0PmVwigmo8XoORE5w= github.com/rs/zerolog v1.29.0/go.mod h1:NILgTygv/Uej1ra5XxGf82ZFSLk58MFGAUS2o6usyD0= +github.com/rs/zerolog v1.29.1 h1:cO+d60CHkknCbvzEWxP0S9K6KqyTjrCNUy1LdQLCGPc= +github.com/rs/zerolog v1.29.1/go.mod h1:Le6ESbR7hc+DP6Lt1THiV8CQSdkkNrd3R0XbEgp3ZBU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= diff --git a/internal/metrics/config.go b/internal/metrics/config.go new file mode 100644 index 0000000..a009316 --- /dev/null +++ b/internal/metrics/config.go @@ -0,0 +1,7 @@ +package metrics + +type MetricsConfig struct { + HealthMinions bool + HealthFunctionsFilters []string + HealthStatesFilters []string +} diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index a2b7a14..31c1fb5 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -11,7 +11,24 @@ import ( "github.com/rs/zerolog/log" ) -func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent) { +func boolToFloat64(b bool) float64 { + if b { + return 1.0 + } + return 0.0 +} + +// Function to check if a string exists in a 
slice of strings +func contains(slice []string, str string) bool { + for _, s := range slice { + if s == str { + return true + } + } + return false +} + +func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent, metricsConfig MetricsConfig) { newJobCounter := promauto.NewCounterVec( prometheus.CounterOpts{ Name: "salt_new_job_total", @@ -49,6 +66,14 @@ func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent) { []string{"function", "state"}, ) + lastStateHealth := promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "salt_state_health", + Help: "Last state success state, 0=Failed, 1=Success", + }, + []string{"minion", "function", "state"}, + ) + for { select { case <-ctx.Done(): @@ -76,16 +101,27 @@ func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent) { strconv.FormatBool(success), ).Inc() } else { - sucess := strconv.FormatBool(event.Data.Success) + success := strconv.FormatBool(event.Data.Success) + responsesCounter.WithLabelValues( event.Data.Id, - sucess, + success, ).Inc() functionResponsesCounter.WithLabelValues( event.Data.Fun, state, - sucess, + success, ).Inc() + + if metricsConfig.HealthMinions { + if contains(metricsConfig.HealthFunctionsFilters, event.Data.Fun) && + contains(metricsConfig.HealthStatesFilters, state) { + lastStateHealth.WithLabelValues( + event.Data.Id, + event.Data.Fun, + state).Set(boolToFloat64(event.Data.Success)) + } + } } } diff --git a/main.go b/main.go index 613e27d..fe3bd52 100644 --- a/main.go +++ b/main.go @@ -7,6 +7,7 @@ import ( "net/http" "os" "os/signal" + "strings" "syscall" "github.com/kpetremann/salt-exporter/internal/logging" @@ -28,6 +29,11 @@ func main() { tlsEnabled := flag.Bool("tls", false, "enable TLS") tlsCert := flag.String("tls-cert", "", "TLS certificated") tlsKey := flag.String("tls-key", "", "TLS private key") + healthMinions := flag.Bool("health-minions", true, "Enable health metric for each minion") + healthFunctionsFilters := 
flag.String("health-functions-filter", "state.highstate", + "Apply filter on functions to monitor, separated by a comma") + healthStatesFilters := flag.String("health-states-filter", "highstate", + "Apply filter on states to monitor, separated by a comma") flag.Parse() logging.ConfigureLogging() @@ -47,6 +53,17 @@ func main() { } } + var metricsConfig metrics.MetricsConfig + metricsConfig.HealthMinions = *healthMinions + metricsConfig.HealthFunctionsFilters = strings.Split(*healthFunctionsFilters, ",") + metricsConfig.HealthStatesFilters = strings.Split(*healthStatesFilters, ",") + + if metricsConfig.HealthMinions { + log.Info().Msg("health-minions: metrics are enabled") + log.Info().Msg("health-minions: functions filters: " + *healthFunctionsFilters) + log.Info().Msg("health-minions: states filters: " + *healthStatesFilters) + } + listenSocket := fmt.Sprint(*listenAddress, ":", *listenPort) ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) @@ -59,7 +76,7 @@ func main() { eventListener := events.NewEventListener(ctx, eventChan) go eventListener.ListenEvents() - go metrics.ExposeMetrics(ctx, eventChan) + go metrics.ExposeMetrics(ctx, eventChan, metricsConfig) // start http server log.Info().Msg("exposing metrics on " + listenSocket + "/metrics") diff --git a/prometheus_alerts/highstate.yaml b/prometheus_alerts/highstate.yaml new file mode 100644 index 0000000..17b280f --- /dev/null +++ b/prometheus_alerts/highstate.yaml @@ -0,0 +1,20 @@ +groups: + - name: saltstack + rules: + - alert: SaltExporterLastHighstateSuccess + expr: sum by(minion) (salt_state_health{function="state.highstate", state="highstate"} == 0) + for: 60m + labels: + severity: critical + minion: "{{ $labels.minion }}" + annotations: + summary: "Salt Last Successful Highstate Failed (minion {{ $labels.minion }})" + description: "Salt Last Successful Highstate failed since > 60m" + - alert: SaltExporterLastHighstateSuccessInfo + expr: sum by(minion) 
(salt_state_health{function="state.highstate", state="highstate"} == 0) + for: 10m + labels: + severity: info + minion: "{{ $labels.minion }}" + annotations: + summary: "Salt Last Successful Highstate Failed (minion {{ $labels.minion }})" \ No newline at end of file