From 465d27f6d5c94c59c1fd36f7d6bd6068c15d0d55 Mon Sep 17 00:00:00 2001 From: Kevin Petremann Date: Fri, 14 Apr 2023 14:43:39 +0200 Subject: [PATCH] fix: schedule job always success even when failing The "success" value returned for a scheduled job is always true, even when the executed states have failed. However, the retcode seems to be enough to catch these issues. --- internal/metrics/metrics.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 27c4860..a2b7a14 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -11,7 +11,7 @@ import ( "github.com/rs/zerolog/log" ) -func ExposeMetrics(ctx context.Context, eventChan chan events.SaltEvent) { +func ExposeMetrics(ctx context.Context, eventChan <-chan events.SaltEvent) { newJobCounter := promauto.NewCounterVec( prometheus.CounterOpts{ Name: "salt_new_job_total", @@ -65,10 +65,15 @@ func ExposeMetrics(ctx context.Context, eventChan chan events.SaltEvent) { case "ret": state := event.ExtractState() if event.IsScheduleJob { + // for scheduled job, when the states in the job actually failed + // - the global "success" value is always true + // - the substate success is false, and the global retcode is > 0 + // using retcode could be enough, but in case there are other corner cases, we combine both values + success := event.Data.Success && (event.Data.Retcode == 0) scheduledJobReturnCounter.WithLabelValues( event.Data.Fun, state, - strconv.FormatBool(event.Data.Success), + strconv.FormatBool(success), ).Inc() } else { sucess := strconv.FormatBool(event.Data.Success)