From 4ee97b8f14486b8f8e9e9cc8acc6e0d28fc2005e Mon Sep 17 00:00:00 2001 From: Tiffany Pei Date: Tue, 28 May 2024 21:43:10 +0000 Subject: [PATCH] [metric] #4 Initialize count metrics on reconciler start The Count views won't show up in query results until data is sent when error condition happens. This change initialize these metrics and adjust the e2e test to check for Rate aggregation. The public documentation should also be revised to recommend the Rate aggregation for this type of metrics. --- e2e/nomostest/prometheus_metrics.go | 38 ++++++++++++++--------------- pkg/metrics/record.go | 3 +++ pkg/parse/run.go | 6 +++++ 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/e2e/nomostest/prometheus_metrics.go b/e2e/nomostest/prometheus_metrics.go index 36bf9b938f..ff049bb565 100644 --- a/e2e/nomostest/prometheus_metrics.go +++ b/e2e/nomostest/prometheus_metrics.go @@ -344,42 +344,42 @@ func metricResourceFightsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.Labe labels := prometheusmodel.LabelSet{ prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), }.Merge(syncLabels) - // ResourceFightsView counts the total number of ResourceFights, so we don't need to aggregate - query := fmt.Sprintf("%s%s", metricName, labels) if value == 0 { - // Tolerate missing metrics when expecting a zero value. - // Don't allow any value other than zero. - return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value)) + query := fmt.Sprintf("rate(%s%s[1m])", metricName, labels) + return metricExistsWithValue(ctx, nt, v1api, query, 0) } + query := fmt.Sprintf("%s%s", metricName, labels) return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value)) } } // metricResourceConflictsHasValueAtLeast returns a MetricsPredicate that -// validates that ResourceConflicts has at least the expected value. -// If the expected value is zero, the metric must be zero or not found. +// validates whether ResourceConflicts has been recorded recently +// If the expected value is zero, the rate at which ResourceConflicts occur is also zero. func metricResourceConflictsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.LabelSet, commitHash string, value int) MetricsPredicate { return func(ctx context.Context, v1api prometheusv1.API) error { metricName := ocmetrics.ResourceConflictsView.Name metricName = fmt.Sprintf("%s%s", prometheusConfigSyncMetricPrefix, metricName) + if value == 0 { + labels := prometheusmodel.LabelSet{ + prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), + prometheusmodel.LabelName(ocmetrics.KeyCommit.Name()): ocmetrics.Initialization, + }.Merge(syncLabels) + query := fmt.Sprintf("rate(%s%s[30s])", metricName, labels) + return metricExistsWithValue(ctx, nt, v1api, query, 0) + } labels := prometheusmodel.LabelSet{ prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), prometheusmodel.LabelName(ocmetrics.KeyCommit.Name()): prometheusmodel.LabelValue(commitHash), }.Merge(syncLabels) - // ResourceConflictsView counts the total number of ResourceConflicts, so we don't need to aggregate query := fmt.Sprintf("%s%s", metricName, labels) - if value == 0 { - // Tolerate missing metrics when expecting a zero value. - // Don't allow any value other than zero. - return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value)) - } return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value)) } } // metricInternalErrorsHasValueAtLeast returns a MetricsPredicate that validates -// that InternalErrors has at least the expected value. -// If the expected value is zero, the metric must be zero or not found. +// whether InternalErrors has been recorded recently. +// If the expected value is zero, the rate at which InternalErrors occur is also zero. func metricInternalErrorsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.LabelSet, value int) MetricsPredicate { return func(ctx context.Context, v1api prometheusv1.API) error { metricName := ocmetrics.InternalErrorsView.Name @@ -387,13 +387,11 @@ func metricInternalErrorsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.Labe labels := prometheusmodel.LabelSet{ prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), }.Merge(syncLabels) - // InternalErrorsView counts the total number of InternalErrors, so we don't need to aggregate - query := fmt.Sprintf("%s%s", metricName, labels) if value == 0 { - // Tolerate missing metrics when expecting a zero value. - // Don't allow any value other than zero. - return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value)) + query := fmt.Sprintf("rate(%s%s[1m])", metricName, labels) + return metricExistsWithValue(ctx, nt, v1api, query, 0) } + query := fmt.Sprintf("%s%s", metricName, labels) return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value)) } } diff --git a/pkg/metrics/record.go b/pkg/metrics/record.go index ed8b9fe0af..5e07f6fde5 100644 --- a/pkg/metrics/record.go +++ b/pkg/metrics/record.go @@ -27,6 +27,9 @@ import ( "kpt.dev/configsync/pkg/status" ) +// Initialization is the tag value to initiate a Count +const Initialization = "cs_counter_initialization" + func record(ctx context.Context, ms ...stats.Measurement) { stats.Record(ctx, ms...) if klog.V(5).Enabled() { diff --git a/pkg/parse/run.go b/pkg/parse/run.go index ba48f9be83..b1b94325a8 100644 --- a/pkg/parse/run.go +++ b/pkg/parse/run.go @@ -78,6 +78,12 @@ type RunFunc func(ctx context.Context, p Parser, trigger string, state *reconcil // Run keeps checking whether a parse-apply-watch loop is necessary and starts a loop if needed. func Run(ctx context.Context, p Parser, nsControllerState *namespacecontroller.State, runOpts RunOpts) { + // Pre-initialize count metrics to ensure visibility, even if errors haven't occurred. + // Special label indicates initial state. Metrics are cumulative and used for rate aggregation + // to detect ongoing errors. Zero value means no recent errors. + metrics.RecordResourceFight(ctx, metrics.Initialization) + metrics.RecordInternalError(ctx, metrics.Initialization) + metrics.RecordResourceConflict(ctx, metrics.Initialization) opts := p.options() // Use timers, not tickers. // Tickers can cause memory leaks and continuous execution, when execution