From fbc4741720e214d57eecdc1f99f7b54f9aafb4e5 Mon Sep 17 00:00:00 2001 From: Kevin Petremann Date: Mon, 17 Apr 2023 11:27:13 +0200 Subject: [PATCH] Rename salt_state_health to salt_function_status Because it is not only for state. Note: for a state, there is always a function. --- README.md | 24 +++++++++++++++--------- internal/metrics/metrics.go | 17 ++++++++--------- prometheus_alerts/highstate.yaml | 4 ++-- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index c7b6d82..a62ce11 100644 --- a/README.md +++ b/README.md @@ -97,26 +97,32 @@ salt_new_job_total{function="state.sls",state="test",success="false"} 1 salt_new_job_total{function="state.single",state="test.nop",success="true"} 3 ``` -### Health Minions metrics -By default, the state.highstate will also generate a health metrics: +### Minions job status + +By default, a Salt highstate will generate a status metric: ``` -salt_state_health{function="state.highstate",minion="node1",state="highstate"} 1 +salt_function_status{function="state.highstate",minion="node1",state="highstate"} 1 ``` -* `1` mean that the last time this couple of function/state were called, the return was `successful` -* `0` mean that the last time this couple of function/state were called, the return was `failed` +* `1` means that the last time this couple of function/state were executed, the return was `successful` +* `0` means that the last time this couple of function/state were executed, the return was `failed` -You will find a example of prometheus alerts that could be used with these default metrics in the prometheus_alerts directory. +You will find an example of Prometheus alerts that could be used with this metric in the `prometheus_alerts` directory. 
-The health metrics can be customized by using the -health-functions-filter and -health-states-filter, example of usage: +The health metrics can be customized by using the `-health-functions-filter` and `-health-states-filter`, example of usage: ``` ./salt-exporter -health-states-filter=test.ping,state.apply -health-functions-filter="" ``` -This will only generate health minion metrics for the test.ping function call: + +This will only generate a metric for the `test.ping` function executed: ``` -salt_state_health{function="test.ping",minion="node1",state=""} 1 +salt_function_status{function="test.ping",minion="node1",state=""} 1 ``` + You can disable all the health metrics with this config switch: ```./salt-exporter -health-minions=false``` + +Note: this also works for scheduled jobs. + ### `salt/job//new` It increases: diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 06c833c..ffd526d 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -50,6 +50,13 @@ func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent, metri }, []string{"function", "state", "success"}, ) + lastFunctionHealth := promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "salt_function_status", + Help: "Last state success function, 0=Failed, 1=Success", + }, + []string{"minion", "function", "state"}, + ) scheduledJobReturnCounter := promauto.NewCounterVec( prometheus.CounterOpts{ @@ -66,14 +73,6 @@ func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent, metri []string{"function", "state"}, ) - lastStateHealth := promauto.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "salt_job_health", - Help: "Last state success state, 0=Failed, 1=Success", - }, - []string{"minion", "function", "state"}, - ) - for { select { case <-ctx.Done(): @@ -117,7 +116,7 @@ func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent, metri // Expose state/func status if metricsConfig.HealthMinions { if 
contains(metricsConfig.HealthFunctionsFilters, event.Data.Fun) && contains(metricsConfig.HealthStatesFilters, state) { - lastStateHealth.WithLabelValues( + lastFunctionHealth.WithLabelValues( event.Data.Id, event.Data.Fun, state).Set(boolToFloat64(event.Data.Success)) diff --git a/prometheus_alerts/highstate.yaml b/prometheus_alerts/highstate.yaml index 17b280f..5f061f7 100644 --- a/prometheus_alerts/highstate.yaml +++ b/prometheus_alerts/highstate.yaml @@ -2,7 +2,7 @@ groups: - name: saltstack rules: - alert: SaltExporterLastHighstateSuccess - expr: sum by(minion) (salt_state_health{function="state.highstate", state="highstate"} == 0) + expr: sum by(minion) (salt_function_status{function="state.highstate", state="highstate"} == 0) for: 60m labels: severity: critical @@ -11,7 +11,7 @@ groups: summary: "Salt Last Successful Highstate Failed (minion {{ $labels.minion }})" description: "Salt Last Successful Highstate failed since > 60m" - alert: SaltExporterLastHighstateSuccessInfo - expr: sum by(minion) (salt_state_health{function="state.highstate", state="highstate"} == 0) + expr: sum by(minion) (salt_function_status{function="state.highstate", state="highstate"} == 0) for: 10m labels: severity: info