From 78b46cc2e2bb6a894c6949bed20c56b8bb39e921 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 28 Feb 2023 22:59:04 -0800 Subject: [PATCH] Merge metrics to fire singleton metrics to controller-runtime namespace --- pkg/controllers/metrics/pod/controller.go | 2 +- .../metrics/provisioner/controller.go | 2 +- pkg/controllers/metrics/state/controller.go | 2 +- pkg/controllers/provisioning/controller.go | 2 +- pkg/controllers/provisioning/provisioner.go | 2 +- pkg/controllers/state/informer/node.go | 2 +- pkg/controllers/state/informer/pod.go | 2 +- pkg/controllers/state/informer/provisioner.go | 2 +- pkg/operator/controller/singleton.go | 119 ++++++++++++------ 9 files changed, 92 insertions(+), 43 deletions(-) diff --git a/pkg/controllers/metrics/pod/controller.go b/pkg/controllers/metrics/pod/controller.go index b40b7849e2..87b81f3a2e 100644 --- a/pkg/controllers/metrics/pod/controller.go +++ b/pkg/controllers/metrics/pod/controller.go @@ -112,7 +112,7 @@ func NewController(kubeClient client.Client) controller.Controller { } func (c *Controller) Name() string { - return "podmetrics" + return "pod_metrics" } // Reconcile executes a termination control loop for the resource diff --git a/pkg/controllers/metrics/provisioner/controller.go b/pkg/controllers/metrics/provisioner/controller.go index 94fd56ac82..9f6254c2de 100644 --- a/pkg/controllers/metrics/provisioner/controller.go +++ b/pkg/controllers/metrics/provisioner/controller.go @@ -101,7 +101,7 @@ func NewController(kubeClient client.Client) corecontroller.Controller { } func (c *Controller) Name() string { - return "provisionermetrics" + return "provisioner_metrics" } // Reconcile executes a termination control loop for the resource diff --git a/pkg/controllers/metrics/state/controller.go b/pkg/controllers/metrics/state/controller.go index 2b2ba5ceca..743806305e 100644 --- a/pkg/controllers/metrics/state/controller.go +++ b/pkg/controllers/metrics/state/controller.go @@ -41,7 +41,7 @@ func NewController(cluster *state.Cluster) *Controller { } func (c *Controller) Name() string { - return "metricscraper" + return "metric_scraper" } func (c *Controller) Builder(_ context.Context, mgr manager.Manager) controller.Builder { diff --git a/pkg/controllers/provisioning/controller.go b/pkg/controllers/provisioning/controller.go index 2c7fe247c0..72efb34407 100644 --- a/pkg/controllers/provisioning/controller.go +++ b/pkg/controllers/provisioning/controller.go @@ -49,7 +49,7 @@ func NewController(kubeClient client.Client, provisioner *Provisioner, recorder } func (c *Controller) Name() string { - return "provisioning" + return "provisioner_trigger" } // Reconcile the resource diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go index f0ef1b3b73..0343c13f2f 100644 --- a/pkg/controllers/provisioning/provisioner.go +++ b/pkg/controllers/provisioning/provisioner.go @@ -448,7 +448,7 @@ func validateNodeSelectorTerm(term v1.NodeSelectorTerm) (errs error) { var schedulingDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: metrics.Namespace, - Subsystem: "allocation_controller", + Subsystem: "provisioner", Name: "scheduling_duration_seconds", Help: "Duration of scheduling process in seconds. Broken down by provisioner and error.", Buckets: metrics.DurationBuckets(), diff --git a/pkg/controllers/state/informer/node.go b/pkg/controllers/state/informer/node.go index ebd0209b03..160f7a37d4 100644 --- a/pkg/controllers/state/informer/node.go +++ b/pkg/controllers/state/informer/node.go @@ -45,7 +45,7 @@ func NewNodeController(kubeClient client.Client, cluster *state.Cluster) corecon } func (c *NodeController) Name() string { - return "node-state" + return "node_state" } func (c *NodeController) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { diff --git a/pkg/controllers/state/informer/pod.go b/pkg/controllers/state/informer/pod.go index 387f16b0eb..dba60bcf51 100644 --- a/pkg/controllers/state/informer/pod.go +++ b/pkg/controllers/state/informer/pod.go @@ -47,7 +47,7 @@ func NewPodController(kubeClient client.Client, cluster *state.Cluster) corecont } func (c *PodController) Name() string { - return "pod-state" + return "pod_state" } func (c *PodController) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { diff --git a/pkg/controllers/state/informer/provisioner.go b/pkg/controllers/state/informer/provisioner.go index 13f21ffe0c..35a61025b5 100644 --- a/pkg/controllers/state/informer/provisioner.go +++ b/pkg/controllers/state/informer/provisioner.go @@ -46,7 +46,7 @@ func NewProvisionerController(kubeClient client.Client, cluster *state.Cluster) } func (c *ProvisionerController) Name() string { - return "provisionerstate" + return "provisioner_state" } func (c *ProvisionerController) Reconcile(_ context.Context, _ *v1alpha5.Provisioner) (reconcile.Result, error) { diff --git a/pkg/operator/controller/singleton.go b/pkg/operator/controller/singleton.go index cc3c5a1ec2..e4022632d6 100644 --- a/pkg/operator/controller/singleton.go +++ b/pkg/operator/controller/singleton.go @@ -16,11 +16,10 @@ package controller import ( "context" - "strings" + "errors" "time" "github.com/prometheus/client_golang/prometheus" - "github.com/samber/lo" "k8s.io/client-go/util/workqueue" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -47,45 +46,28 @@ func (b SingletonBuilder) Complete(r Reconciler) error { type Singleton struct { Reconciler - metrics *singletonMetrics rateLimiter ratelimiter.RateLimiter } -type singletonMetrics struct { - reconcileDuration prometheus.Histogram - reconcileErrors prometheus.Counter -} - func newSingleton(r Reconciler) *Singleton { - return &Singleton{ + s := &Singleton{ Reconciler: r, - metrics: newSingletonMetrics(r.Name()), rateLimiter: workqueue.DefaultItemBasedRateLimiter(), } + s.initMetrics() + return s } -func newSingletonMetrics(name string) *singletonMetrics { - metrics := &singletonMetrics{ - reconcileDuration: prometheus.NewHistogram( - prometheus.HistogramOpts{ - Namespace: metrics.Namespace, - Subsystem: strings.ReplaceAll(name, ".", "_"), - Name: "reconcile_time_seconds", - Help: "Length of time per reconcile.", - Buckets: metrics.DurationBuckets(), - }, - ), - reconcileErrors: prometheus.NewCounter( - prometheus.CounterOpts{ - Namespace: metrics.Namespace, - Subsystem: strings.ReplaceAll(name, ".", "_"), - Name: "reconcile_errors_total", - Help: "Total number of reconcile errors.", - }, - ), - } - crmetrics.Registry.MustRegister(metrics.reconcileDuration, metrics.reconcileErrors) - return metrics +// initMetrics is effectively the same metrics initialization function used by controller-runtime +// https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/internal/controller/controller.go +func (s *Singleton) initMetrics() { + activeWorkers.WithLabelValues(s.Name()).Set(0) + reconcileErrors.WithLabelValues(s.Name()).Add(0) + reconcileTotal.WithLabelValues(s.Name(), labelError).Add(0) + reconcileTotal.WithLabelValues(s.Name(), labelRequeueAfter).Add(0) + reconcileTotal.WithLabelValues(s.Name(), labelRequeue).Add(0) + reconcileTotal.WithLabelValues(s.Name(), labelSuccess).Add(0) + workerCount.WithLabelValues(s.Name()).Set(float64(1)) } var singletonRequest = reconcile.Request{} @@ -105,23 +87,90 @@ func (s *Singleton) Start(ctx context.Context) error { } func (s *Singleton) reconcile(ctx context.Context) time.Duration { - measureDuration := metrics.Measure(s.metrics.reconcileDuration) + activeWorkers.WithLabelValues(s.Name()).Inc() + defer activeWorkers.WithLabelValues(s.Name()).Dec() + + measureDuration := metrics.Measure(reconcileDuration.WithLabelValues(s.Name())) res, err := s.Reconcile(ctx, singletonRequest) measureDuration() // Observe the length of time between the function creation and now switch { case err != nil: - s.metrics.reconcileErrors.Inc() + reconcileErrors.WithLabelValues(s.Name()).Inc() + reconcileTotal.WithLabelValues(s.Name(), labelError).Inc() logging.FromContext(ctx).Error(err) return s.rateLimiter.When(singletonRequest) case res.Requeue: + reconcileTotal.WithLabelValues(s.Name(), labelRequeue).Inc() return s.rateLimiter.When(singletonRequest) default: s.rateLimiter.Forget(singletonRequest) - return lo.Ternary(res.RequeueAfter > 0, res.RequeueAfter, time.Duration(0)) + switch { + case res.RequeueAfter > 0: + reconcileTotal.WithLabelValues(s.Name(), labelRequeueAfter).Inc() + return res.RequeueAfter + default: + reconcileTotal.WithLabelValues(s.Name(), labelSuccess).Inc() + return time.Duration(0) + } } } func (s *Singleton) NeedLeaderElection() bool { return true } + +func init() { + mergeMetrics() +} + +const ( + labelError = "error" + labelRequeueAfter = "requeue_after" + labelRequeue = "requeue" + labelSuccess = "success" +) + +// Metrics below are copied metrics fired by controller-runtime in its /internal package. This is leveraged +// so that we can fire to the same namespace as users expect other controller-runtime metrics to be fired +// https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/internal/controller/metrics/metrics.go +var ( + reconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_total", + Help: "Total number of reconciliations per controller", + }, []string{"controller", "result"}) + reconcileDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "controller_runtime_reconcile_time_seconds", + Help: "Length of time per reconciliation per controller", + Buckets: metrics.DurationBuckets(), + }, []string{"controller"}) + reconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_errors_total", + Help: "Total number of reconciliation errors per controller", + }, []string{"controller"}) + workerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_max_concurrent_reconciles", + Help: "Maximum number of concurrent reconciles per controller", + }, []string{"controller"}) + activeWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_active_workers", + Help: "Number of currently used workers per controller", + }, []string{"controller"}) +) + +// mergeMetrics merges the singletonMetrics with metrics already registered in the controller-runtime metrics registry +// https://github.com/kubernetes-sigs/controller-runtime/blob/main/pkg/internal/controller/metrics/metrics.go +// We know that all these metrics should be registered by controller-runtime so we should switch over +func mergeMetrics() { + err := &prometheus.AlreadyRegisteredError{} + errors.As(crmetrics.Registry.Register(reconcileTotal), err) + reconcileTotal = err.ExistingCollector.(*prometheus.CounterVec) + errors.As(crmetrics.Registry.Register(reconcileDuration), err) + reconcileDuration = err.ExistingCollector.(*prometheus.HistogramVec) + errors.As(crmetrics.Registry.Register(reconcileErrors), err) + reconcileErrors = err.ExistingCollector.(*prometheus.CounterVec) + errors.As(crmetrics.Registry.Register(workerCount), err) + workerCount = err.ExistingCollector.(*prometheus.GaugeVec) + errors.As(crmetrics.Registry.Register(activeWorkers), err) + activeWorkers = err.ExistingCollector.(*prometheus.GaugeVec) +}