From f745706e08ff294653c825243f6744c587f0ffd9 Mon Sep 17 00:00:00 2001 From: Alexandre Gaudreault Date: Wed, 14 Feb 2024 16:51:14 +0000 Subject: [PATCH] fix(controller): add missing workqueue metrics (#16315) (#17013) * fix(controller): add missing kubernetes metrics Signed-off-by: Alexandre Gaudreault * validate workqueue metrics are present Signed-off-by: Alexandre Gaudreault * use newer metrics registry Signed-off-by: Alexandre Gaudreault * fix duplicated Signed-off-by: Alexandre Gaudreault * init runtime controller in test to have correct metrics Signed-off-by: Alexandre Gaudreault * fix lint error Signed-off-by: Alexandre Gaudreault * update controller-runtime to remove metrics with high cardinality Signed-off-by: Alexandre Gaudreault --------- Signed-off-by: Alexandre Gaudreault Signed-off-by: Alexandre Gaudreault --- controller/metrics/metrics.go | 8 ++- controller/metrics/metrics_test.go | 79 +++++++++++++++++++++- controller/metrics/workqueue.go | 101 ----------------------------- go.mod | 4 +- go.sum | 4 +- 5 files changed, 87 insertions(+), 109 deletions(-) delete mode 100644 controller/metrics/workqueue.go diff --git a/controller/metrics/metrics.go b/controller/metrics/metrics.go index e4ef09552c09d..94405b51eac75 100644 --- a/controller/metrics/metrics.go +++ b/controller/metrics/metrics.go @@ -23,6 +23,8 @@ import ( "github.com/argoproj/argo-cd/v2/util/git" "github.com/argoproj/argo-cd/v2/util/healthz" "github.com/argoproj/argo-cd/v2/util/profile" + + ctrl_metrics "sigs.k8s.io/controller-runtime/pkg/metrics" ) type MetricsServer struct { @@ -160,12 +162,12 @@ func NewMetricsServer(addr string, appLister applister.ApplicationLister, appFil mux := http.NewServeMux() registry := NewAppRegistry(appLister, appFilter, appLabels) - registry.MustRegister(depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries) + mux.Handle(MetricsPath, promhttp.HandlerFor(prometheus.Gatherers{ // contains app controller specific metrics registry, - // contains process, golang and controller workqueues metrics - prometheus.DefaultGatherer, + // contains workqueue metrics, process and golang metrics + ctrl_metrics.Registry, }, promhttp.HandlerOpts{})) profile.RegisterProfiler(mux) healthz.ServeHealthCheck(mux, healthCheck) diff --git a/controller/metrics/metrics_test.go b/controller/metrics/metrics_test.go index 61a99a46492a2..23628c38347a5 100644 --- a/controller/metrics/metrics_test.go +++ b/controller/metrics/metrics_test.go @@ -2,6 +2,7 @@ package metrics import ( "context" + "fmt" "log" "net/http" "net/http/httptest" @@ -15,12 +16,15 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" "sigs.k8s.io/yaml" argoappv1 "github.com/argoproj/argo-cd/v2/pkg/apis/application/v1alpha1" appclientset "github.com/argoproj/argo-cd/v2/pkg/client/clientset/versioned/fake" appinformer "github.com/argoproj/argo-cd/v2/pkg/client/informers/externalversions" applister "github.com/argoproj/argo-cd/v2/pkg/client/listers/application/v1alpha1" + + "sigs.k8s.io/controller-runtime/pkg/controller" ) const fakeApp = ` @@ -140,6 +144,12 @@ var appFilter = func(obj interface{}) bool { return true } +func init() { + // Create a fake controller so we initialize the internal controller metrics. + // https://github.com/kubernetes-sigs/controller-runtime/blob/4000e996a202917ad7d40f02ed8a2079a9ce25e9/pkg/internal/controller/metrics/metrics.go + _, _ = controller.New("test-controller", nil, controller.Options{}) +} + func newFakeApp(fakeAppYAML string) *argoappv1.Application { var app argoappv1.Application err := yaml.Unmarshal([]byte(fakeAppYAML), &app) @@ -360,7 +370,7 @@ func assertMetricsPrinted(t *testing.T, expectedLines, body string) { if line == "" { continue } - assert.Contains(t, body, line, "expected metrics mismatch") + assert.Contains(t, body, line, fmt.Sprintf("expected metrics mismatch for line: %s", line)) } } @@ -443,3 +453,70 @@ argocd_app_sync_total{dest_server="https://localhost:6443",name="my-app",namespa err = metricsServ.SetExpiration(time.Second) assert.Error(t, err) } + +func TestWorkqueueMetrics(t *testing.T) { + cancel, appLister := newFakeLister() + defer cancel() + metricsServ, err := NewMetricsServer("localhost:8082", appLister, appFilter, noOpHealthCheck, []string{}) + assert.NoError(t, err) + + expectedMetrics := ` +# TYPE workqueue_adds_total counter +workqueue_adds_total{name="test"} + +# TYPE workqueue_depth gauge +workqueue_depth{name="test"} + +# TYPE workqueue_longest_running_processor_seconds gauge +workqueue_longest_running_processor_seconds{name="test"} + +# TYPE workqueue_queue_duration_seconds histogram + +# TYPE workqueue_unfinished_work_seconds gauge +workqueue_unfinished_work_seconds{name="test"} + +# TYPE workqueue_work_duration_seconds histogram +` + workqueue.NewNamed("test") + + req, err := http.NewRequest(http.MethodGet, "/metrics", nil) + assert.NoError(t, err) + rr := httptest.NewRecorder() + metricsServ.Handler.ServeHTTP(rr, req) + assert.Equal(t, rr.Code, http.StatusOK) + body := rr.Body.String() + log.Println(body) + assertMetricsPrinted(t, expectedMetrics, body) +} + +func TestGoMetrics(t *testing.T) { + cancel, appLister := newFakeLister() + defer cancel() + metricsServ, err := NewMetricsServer("localhost:8082", appLister, appFilter, noOpHealthCheck, []string{}) + assert.NoError(t, err) + + expectedMetrics := ` +# TYPE go_gc_duration_seconds summary +go_gc_duration_seconds_sum +go_gc_duration_seconds_count +# TYPE go_goroutines gauge +go_goroutines +# TYPE go_info gauge +go_info +# TYPE go_memstats_alloc_bytes gauge +go_memstats_alloc_bytes +# TYPE go_memstats_sys_bytes gauge +go_memstats_sys_bytes +# TYPE go_threads gauge +go_threads +` + + req, err := http.NewRequest(http.MethodGet, "/metrics", nil) + assert.NoError(t, err) + rr := httptest.NewRecorder() + metricsServ.Handler.ServeHTTP(rr, req) + assert.Equal(t, rr.Code, http.StatusOK) + body := rr.Body.String() + log.Println(body) + assertMetricsPrinted(t, expectedMetrics, body) +} diff --git a/controller/metrics/workqueue.go b/controller/metrics/workqueue.go deleted file mode 100644 index 2ef10685ee47d..0000000000000 --- a/controller/metrics/workqueue.go +++ /dev/null @@ -1,101 +0,0 @@ -package metrics - -import ( - "github.com/prometheus/client_golang/prometheus" - "k8s.io/client-go/util/workqueue" -) - -const ( - WorkQueueSubsystem = "workqueue" - DepthKey = "depth" - AddsKey = "adds_total" - QueueLatencyKey = "queue_duration_seconds" - WorkDurationKey = "work_duration_seconds" - UnfinishedWorkKey = "unfinished_work_seconds" - LongestRunningProcessorKey = "longest_running_processor_seconds" - RetriesKey = "retries_total" -) - -var ( - depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: DepthKey, - Help: "Current depth of workqueue", - }, []string{"name"}) - - adds = prometheus.NewCounterVec(prometheus.CounterOpts{ - Subsystem: WorkQueueSubsystem, - Name: AddsKey, - Help: "Total number of adds handled by workqueue", - }, []string{"name"}) - - latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Subsystem: WorkQueueSubsystem, - Name: QueueLatencyKey, - Help: "How long in seconds an item stays in workqueue before being requested", - Buckets: []float64{1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 15, 30, 60, 120, 180}, - }, []string{"name"}) - - workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ - Subsystem: WorkQueueSubsystem, - Name: WorkDurationKey, - Help: "How long in seconds processing an item from workqueue takes.", - Buckets: []float64{1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 15, 30, 60, 120, 180}, - }, []string{"name"}) - - unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: UnfinishedWorkKey, - Help: "How many seconds of work has been done that " + - "is in progress and hasn't been observed by work_duration. Large " + - "values indicate stuck threads. One can deduce the number of stuck " + - "threads by observing the rate at which this increases.", - }, []string{"name"}) - - longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Subsystem: WorkQueueSubsystem, - Name: LongestRunningProcessorKey, - Help: "How many seconds has the longest running " + - "processor for workqueue been running.", - }, []string{"name"}) - - retries = prometheus.NewCounterVec(prometheus.CounterOpts{ - Subsystem: WorkQueueSubsystem, - Name: RetriesKey, - Help: "Total number of retries handled by workqueue", - }, []string{"name"}) -) - -func init() { - workqueue.SetProvider(workqueueMetricsProvider{}) -} - -type workqueueMetricsProvider struct{} - -func (workqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric { - return depth.WithLabelValues(name) -} - -func (workqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric { - return adds.WithLabelValues(name) -} - -func (workqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric { - return latency.WithLabelValues(name) -} - -func (workqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric { - return workDuration.WithLabelValues(name) -} - -func (workqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric { - return unfinished.WithLabelValues(name) -} - -func (workqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric { - return longestRunningProcessor.WithLabelValues(name) -} - -func (workqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric { - return retries.WithLabelValues(name) -} diff --git a/go.mod b/go.mod index 9be13ee46ac1b..f6753b5e3afd9 100644 --- a/go.mod +++ b/go.mod @@ -92,7 +92,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.26.11 - k8s.io/apiextensions-apiserver v0.26.4 + k8s.io/apiextensions-apiserver v0.26.10 k8s.io/apimachinery v0.26.11 k8s.io/apiserver v0.26.11 k8s.io/client-go v0.26.11 @@ -103,7 +103,7 @@ require ( k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 layeh.com/gopher-json v0.0.0-20190114024228-97fed8db8427 oras.land/oras-go/v2 v2.3.0 - sigs.k8s.io/controller-runtime v0.14.6 + sigs.k8s.io/controller-runtime v0.14.7 sigs.k8s.io/structured-merge-diff/v4 v4.4.1 sigs.k8s.io/yaml v1.3.0 ) diff --git a/go.sum b/go.sum index 5515ec227fee5..4180e143eb4aa 100644 --- a/go.sum +++ b/go.sum @@ -2704,8 +2704,8 @@ rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8 rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= -sigs.k8s.io/controller-runtime v0.14.6 h1:oxstGVvXGNnMvY7TAESYk+lzr6S3V5VFxQ6d92KcwQA= -sigs.k8s.io/controller-runtime v0.14.6/go.mod h1:WqIdsAY6JBsjfc/CqO0CORmNtoCtE4S6qbPc9s68h+0= +sigs.k8s.io/controller-runtime v0.14.7 h1:Vrnm2vk9ZFlRkXATHz0W0wXcqNl7kPat8q2JyxVy0Q8= +sigs.k8s.io/controller-runtime v0.14.7/go.mod h1:ErTs3SJCOujNUnTz4AS+uh8hp6DHMo1gj6fFndJT1X8= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=