diff --git a/e2e/nomostest/prometheus_metrics.go b/e2e/nomostest/prometheus_metrics.go index 36bf9b938f..ff049bb565 100644 --- a/e2e/nomostest/prometheus_metrics.go +++ b/e2e/nomostest/prometheus_metrics.go @@ -344,42 +344,42 @@ func metricResourceFightsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.Labe labels := prometheusmodel.LabelSet{ prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), }.Merge(syncLabels) - // ResourceFightsView counts the total number of ResourceFights, so we don't need to aggregate - query := fmt.Sprintf("%s%s", metricName, labels) if value == 0 { - // Tolerate missing metrics when expecting a zero value. - // Don't allow any value other than zero. - return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value)) + query := fmt.Sprintf("rate(%s%s[1m])", metricName, labels) + return metricExistsWithValue(ctx, nt, v1api, query, 0) } + query := fmt.Sprintf("%s%s", metricName, labels) return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value)) } } // metricResourceConflictsHasValueAtLeast returns a MetricsPredicate that -// validates that ResourceConflicts has at least the expected value. -// If the expected value is zero, the metric must be zero or not found. +// validates whether ResourceConflicts has been recorded recently +// If the expected value is zero, the rate at which ResourceConflicts occur is also zero. func metricResourceConflictsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.LabelSet, commitHash string, value int) MetricsPredicate { return func(ctx context.Context, v1api prometheusv1.API) error { metricName := ocmetrics.ResourceConflictsView.Name metricName = fmt.Sprintf("%s%s", prometheusConfigSyncMetricPrefix, metricName) + if value == 0 { + labels := prometheusmodel.LabelSet{ + prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), + prometheusmodel.LabelName(ocmetrics.KeyCommit.Name()): ocmetrics.Initialization, + }.Merge(syncLabels) + query := fmt.Sprintf("rate(%s%s[30s])", metricName, labels) + return metricExistsWithValue(ctx, nt, v1api, query, 0) + } labels := prometheusmodel.LabelSet{ prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), prometheusmodel.LabelName(ocmetrics.KeyCommit.Name()): prometheusmodel.LabelValue(commitHash), }.Merge(syncLabels) - // ResourceConflictsView counts the total number of ResourceConflicts, so we don't need to aggregate query := fmt.Sprintf("%s%s", metricName, labels) - if value == 0 { - // Tolerate missing metrics when expecting a zero value. - // Don't allow any value other than zero. - return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value)) - } return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value)) } } // metricInternalErrorsHasValueAtLeast returns a MetricsPredicate that validates -// that InternalErrors has at least the expected value. -// If the expected value is zero, the metric must be zero or not found. +// whether InternalErrors has been recorded recently. +// If the expected value is zero, the rate at which InternalErrors occur is also zero. func metricInternalErrorsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.LabelSet, value int) MetricsPredicate { return func(ctx context.Context, v1api prometheusv1.API) error { metricName := ocmetrics.InternalErrorsView.Name @@ -387,13 +387,11 @@ func metricInternalErrorsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.Labe labels := prometheusmodel.LabelSet{ prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName), }.Merge(syncLabels) - // InternalErrorsView counts the total number of InternalErrors, so we don't need to aggregate - query := fmt.Sprintf("%s%s", metricName, labels) if value == 0 { - // Tolerate missing metrics when expecting a zero value. - // Don't allow any value other than zero. - return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value)) + query := fmt.Sprintf("rate(%s%s[1m])", metricName, labels) + return metricExistsWithValue(ctx, nt, v1api, query, 0) } + query := fmt.Sprintf("%s%s", metricName, labels) return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value)) } } diff --git a/pkg/metrics/record.go b/pkg/metrics/record.go index ed8b9fe0af..5e07f6fde5 100644 --- a/pkg/metrics/record.go +++ b/pkg/metrics/record.go @@ -27,6 +27,9 @@ import ( "kpt.dev/configsync/pkg/status" ) +// Initialization is the tag value to initiate a Count +const Initialization = "cs_counter_initialization" + func record(ctx context.Context, ms ...stats.Measurement) { stats.Record(ctx, ms...) if klog.V(5).Enabled() { diff --git a/pkg/parse/run.go b/pkg/parse/run.go index ba48f9be83..b1b94325a8 100644 --- a/pkg/parse/run.go +++ b/pkg/parse/run.go @@ -78,6 +78,12 @@ type RunFunc func(ctx context.Context, p Parser, trigger string, state *reconcil // Run keeps checking whether a parse-apply-watch loop is necessary and starts a loop if needed. func Run(ctx context.Context, p Parser, nsControllerState *namespacecontroller.State, runOpts RunOpts) { + // Pre-initialize count metrics to ensure visibility, even if errors haven't occurred. + // Special label indicates initial state. Metrics are cumulative and used for rate aggregation + // to detect ongoing errors. Zero value means no recent errors. + metrics.RecordResourceFight(ctx, metrics.Initialization) + metrics.RecordInternalError(ctx, metrics.Initialization) + metrics.RecordResourceConflict(ctx, metrics.Initialization) opts := p.options() // Use timers, not tickers. // Tickers can cause memory leaks and continuous execution, when execution