Skip to content

Commit

Permalink
[metric] #4 Initialize count metrics on reconciler start
Browse files Browse the repository at this point in the history
The Count views won't show up in query results until data is sent when error
condition happens.

This change initialize these metrics and adjust the e2e test to check
for Rate aggregation.

The public documentation should also be revised to recommend the Rate
aggregation for this type of metrics.
  • Loading branch information
tiffanny29631 committed May 28, 2024
1 parent 98842b4 commit 4ee97b8
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 20 deletions.
38 changes: 18 additions & 20 deletions e2e/nomostest/prometheus_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -344,56 +344,54 @@ func metricResourceFightsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.Labe
labels := prometheusmodel.LabelSet{
prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName),
}.Merge(syncLabels)
// ResourceFightsView counts the total number of ResourceFights, so we don't need to aggregate
query := fmt.Sprintf("%s%s", metricName, labels)
if value == 0 {
// Tolerate missing metrics when expecting a zero value.
// Don't allow any value other than zero.
return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value))
query := fmt.Sprintf("rate(%s%s[1m])", metricName, labels)
return metricExistsWithValue(ctx, nt, v1api, query, 0)
}
query := fmt.Sprintf("%s%s", metricName, labels)
return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value))
}
}

// metricResourceConflictsHasValueAtLeast returns a MetricsPredicate that
// validates that ResourceConflicts has at least the expected value.
// If the expected value is zero, the metric must be zero or not found.
// validates whether ResourceConflicts has been recorded recently
// If the expected value is zero, the rate at which ResourceConflicts occur is also zero.
func metricResourceConflictsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.LabelSet, commitHash string, value int) MetricsPredicate {
return func(ctx context.Context, v1api prometheusv1.API) error {
metricName := ocmetrics.ResourceConflictsView.Name
metricName = fmt.Sprintf("%s%s", prometheusConfigSyncMetricPrefix, metricName)
if value == 0 {
labels := prometheusmodel.LabelSet{
prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName),
prometheusmodel.LabelName(ocmetrics.KeyCommit.Name()): ocmetrics.Initialization,
}.Merge(syncLabels)
query := fmt.Sprintf("rate(%s%s[30s])", metricName, labels)
return metricExistsWithValue(ctx, nt, v1api, query, 0)
}
labels := prometheusmodel.LabelSet{
prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName),
prometheusmodel.LabelName(ocmetrics.KeyCommit.Name()): prometheusmodel.LabelValue(commitHash),
}.Merge(syncLabels)
// ResourceConflictsView counts the total number of ResourceConflicts, so we don't need to aggregate
query := fmt.Sprintf("%s%s", metricName, labels)
if value == 0 {
// Tolerate missing metrics when expecting a zero value.
// Don't allow any value other than zero.
return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value))
}
return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value))
}
}

// metricInternalErrorsHasValueAtLeast returns a MetricsPredicate that validates
// that InternalErrors has at least the expected value.
// If the expected value is zero, the metric must be zero or not found.
// whether InternalErrors has been recorded recently.
// If the expected value is zero, the rate at which InternalErrors occur is also zero.
func metricInternalErrorsHasValueAtLeast(nt *NT, syncLabels prometheusmodel.LabelSet, value int) MetricsPredicate {
return func(ctx context.Context, v1api prometheusv1.API) error {
metricName := ocmetrics.InternalErrorsView.Name
metricName = fmt.Sprintf("%s%s", prometheusConfigSyncMetricPrefix, metricName)
labels := prometheusmodel.LabelSet{
prometheusmodel.LabelName(ocmetrics.KeyComponent.Name()): prometheusmodel.LabelValue(ocmetrics.OtelCollectorName),
}.Merge(syncLabels)
// InternalErrorsView counts the total number of InternalErrors, so we don't need to aggregate
query := fmt.Sprintf("%s%s", metricName, labels)
if value == 0 {
// Tolerate missing metrics when expecting a zero value.
// Don't allow any value other than zero.
return metricExistsWithValueOrDoesNotExist(ctx, nt, v1api, query, float64(value))
query := fmt.Sprintf("rate(%s%s[1m])", metricName, labels)
return metricExistsWithValue(ctx, nt, v1api, query, 0)
}
query := fmt.Sprintf("%s%s", metricName, labels)
return metricExistsWithValueAtLeast(ctx, nt, v1api, query, float64(value))
}
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/metrics/record.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ import (
"kpt.dev/configsync/pkg/status"
)

// Initialization is the tag value to initiate a Count
const Initialization = "cs_counter_initialization"

func record(ctx context.Context, ms ...stats.Measurement) {
stats.Record(ctx, ms...)
if klog.V(5).Enabled() {
Expand Down
6 changes: 6 additions & 0 deletions pkg/parse/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ type RunFunc func(ctx context.Context, p Parser, trigger string, state *reconcil

// Run keeps checking whether a parse-apply-watch loop is necessary and starts a loop if needed.
func Run(ctx context.Context, p Parser, nsControllerState *namespacecontroller.State, runOpts RunOpts) {
// Pre-initialize count metrics to ensure visibility, even if errors haven't occurred.
// Special label indicates initial state. Metrics are cumulative and used for rate aggregation
// to detect ongoing errors. Zero value means no recent errors.
metrics.RecordResourceFight(ctx, metrics.Initialization)
metrics.RecordInternalError(ctx, metrics.Initialization)
metrics.RecordResourceConflict(ctx, metrics.Initialization)
opts := p.options()
// Use timers, not tickers.
// Tickers can cause memory leaks and continuous execution, when execution
Expand Down

0 comments on commit 4ee97b8

Please sign in to comment.