cert-manager · cert-manager-prow · Jun 28, 2024 · Jun 25, 2024 · Jun 25, 2024 · Jun 27, 2024
diff --git a/cmd/app/app.go b/cmd/app/app.go
@@ -23,14 +23,18 @@ import (
 	"crypto/x509"
 	"encoding/pem"
 	"fmt"
+	"net/http"
 
 	"github.com/cert-manager/csi-lib/driver"
 	"github.com/cert-manager/csi-lib/manager"
 	"github.com/cert-manager/csi-lib/manager/util"
 	"github.com/cert-manager/csi-lib/metadata"
 	"github.com/cert-manager/csi-lib/storage"
 	"github.com/spf13/cobra"
+	"golang.org/x/sync/errgroup"
 	"k8s.io/utils/clock"
+	ctrl "sigs.k8s.io/controller-runtime"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 
 	"github.com/cert-manager/csi-driver/cmd/app/options"
 	"github.com/cert-manager/csi-driver/internal/version"
@@ -57,8 +61,11 @@ func NewCommand(ctx context.Context) *cobra.Command {
 		},
 		RunE: func(cmd *cobra.Command, args []string) error {
 			log := opts.Logr.WithName("main")
-			log.Info("Starting driver", "version", version.VersionInfo())
+			// Set the controller-runtime logger so that we get the
+			// controller-runtime metricsserver logs.
+			ctrl.SetLogger(log)
 
+			log.Info("Starting driver", "version", version.VersionInfo())
 			store, err := storage.NewFilesystem(opts.Logr.WithName("storage"), opts.DataRoot)
 			if err != nil {
 				return fmt.Errorf("failed to setup filesystem: %w", err)
@@ -96,18 +103,68 @@ func NewCommand(ctx context.Context) *cobra.Command {
 				return fmt.Errorf("failed to setup driver: " + err.Error())
 			}
 
-			go func() {
+			g, gCTX := errgroup.WithContext(ctx)
+			g.Go(func() error {
-				<-ctx.Done()
-				log.Info("shutting down driver", "context", ctx.Err())
+				<-gCTX.Done() // gCTX, not ctx: also fires when a sibling goroutine in g fails, so g.Wait() cannot hang
+				log.Info("shutting down driver", "context", gCTX.Err())
 				d.Stop()
-			}()
+				return nil
+			})
 
-			log.Info("running driver")
-			if err := d.Run(); err != nil {
-				return fmt.Errorf("failed running driver: " + err.Error())
-			}
+			g.Go(func() error {
+				log.Info("running driver")
+				if err := d.Run(); err != nil {
+					return fmt.Errorf("failed running driver: %w", err) // %w keeps the cause available to errors.Is/As
+				}
+				return nil
+			})
 
-			return nil
+			// Start a metrics server if the --metrics-bind-address is not "0".
+			//
+			// By default this serves every metric that controller-runtime registers
+			// in its global metrics registry, including:
+			// * Go Runtime metrics
+			// * Process metrics
+			// * Various controller-runtime controller metrics
+			//   (not updated by csi-driver because it doesn't use controller-runtime)
+			// * Leader election metrics
+			//   (not updated by csi-driver because it doesn't use leader-election)
+			//
+			// The full list is here:
+			// https://github.com/kubernetes-sigs/controller-runtime/blob/700befecdffa803d19830a6a43adc5779ed01e26/pkg/internal/controller/metrics/metrics.go#L73-L86
+			//
+			// The advantages of using the controller-runtime metricsserver are:
+			// * It already exists and is actively maintained.
+			// * Provides optional features for securing the metrics endpoint by
+			//   TLS and by authentication with a K8S service account token,
+			//   should that be requested by users in the future.
+			// * Consistency with cert-manager/approver-policy, which also uses
+			//   this library and therefore publishes the same set of
+			//   controller-runtime base metrics.
+			// Disadvantages:
+			// * It introduces a dependency on controller-runtime, which often
+			//   introduces breaking changes.
+			// * It uses a global metrics registry, which has the usual risks
+			//   associated with globals and makes it difficult for us to control
+			//   which metrics are published for csi-driver.
+			//   https://github.com/kubernetes-sigs/controller-runtime/issues/210
+			var unusedHTTPClient *http.Client
+			metricsServer, err := metricsserver.NewServer(
+				metricsserver.Options{
+					BindAddress: opts.MetricsBindAddress,
+				},
+				opts.RestConfig,
+				unusedHTTPClient,
+			)
+			if err != nil {
+				return fmt.Errorf("failed to setup metrics server: %w", err)
+			}
+			if metricsServer != nil { // no server is started when metrics are disabled (--metrics-bind-address=0)
+				g.Go(func() error {
+					return metricsServer.Start(gCTX)
+				})
+			}
+			return g.Wait()
 		},
 	}
 

diff --git a/cmd/app/options/options.go b/cmd/app/options/options.go
@@ -69,6 +69,11 @@ type Options struct {
 
 	// CMClient is a rest client for interacting with cert-manager resources.
 	CMClient cmclient.Interface
+
+	// MetricsBindAddress is the TCP address for exposing HTTP Prometheus metrics
+	// which will be served on the HTTP path '/metrics'. The value "0" will
+	// disable exposing metrics.
+	MetricsBindAddress string
 }
 
 func New() *Options {
@@ -152,4 +157,7 @@ func (o *Options) addAppFlags(fs *pflag.FlagSet) {
 
 	fs.BoolVar(&o.UseTokenRequest, "use-token-request", false,
 		"Use the empty audience token request for creating CertificateRequests. Requires the token request to be defined on the CSIDriver manifest.")
+	fs.StringVar(&o.MetricsBindAddress, "metrics-bind-address", "0",
+		"TCP address for exposing HTTP Prometheus metrics which will be served on the HTTP path '/metrics'. "+
+			`The value "0" will disable exposing metrics.`)
 }
diff --git a/deploy/charts/csi-driver/README.md b/deploy/charts/csi-driver/README.md
@@ -6,6 +6,95 @@
 
 <!-- AUTO-GENERATED -->
 
+#### **metrics.enabled** ~ `bool`
+> Default value:
+> ```yaml
+> true
+> ```
+
+Enable the metrics server on csi-driver pods.  
+If false, the metrics server will be disabled and the other metrics fields below will be ignored.
+#### **metrics.port** ~ `number`
+> Default value:
+> ```yaml
+> 9402
+> ```
+
+The TCP port on which the metrics server will listen.
+#### **metrics.podmonitor.enabled** ~ `bool`
+> Default value:
+> ```yaml
+> false
+> ```
+
+Create a PodMonitor to add csi-driver to Prometheus if you are using Prometheus Operator. See https://prometheus-operator.dev/docs/operator/api/#monitoring.coreos.com/v1.PodMonitor
+#### **metrics.podmonitor.namespace** ~ `string`
+
+The namespace that the PodMonitor should live in. Defaults to the namespace in which cert-manager-csi-driver is installed.
+
+#### **metrics.podmonitor.prometheusInstance** ~ `string`
+> Default value:
+> ```yaml
+> default
+> ```
+
+Specifies the `prometheus` label on the created PodMonitor. This is used when different Prometheus instances have label selectors matching different PodMonitors.
+#### **metrics.podmonitor.interval** ~ `string`
+> Default value:
+> ```yaml
+> 60s
+> ```
+
+The interval to scrape metrics.
+#### **metrics.podmonitor.scrapeTimeout** ~ `string`
+> Default value:
+> ```yaml
+> 30s
+> ```
+
+The timeout before a metrics scrape fails.
+#### **metrics.podmonitor.labels** ~ `object`
+> Default value:
+> ```yaml
+> {}
+> ```
+
+Additional labels to add to the PodMonitor.
+#### **metrics.podmonitor.annotations** ~ `object`
+> Default value:
+> ```yaml
+> {}
+> ```
+
+Additional annotations to add to the PodMonitor.
+#### **metrics.podmonitor.honorLabels** ~ `bool`
+> Default value:
+> ```yaml
+> false
+> ```
+
+Keep labels from scraped data, overriding server-side labels.
+#### **metrics.podmonitor.endpointAdditionalProperties** ~ `object`
+> Default value:
+> ```yaml
+> {}
+> ```
+
+EndpointAdditionalProperties allows setting additional properties on the endpoint such as relabelings, metricRelabelings etc.  
+
+For example:
+
+```yaml
+endpointAdditionalProperties:
+ relabelings:
+ - action: replace
+   sourceLabels:
+   - __meta_kubernetes_pod_node_name
+   targetLabel: instance
+```
+
+
+
 #### **image.registry** ~ `string`
 
 Target image registry. This value is prepended to the target image repository, if set.  

diff --git a/deploy/charts/csi-driver/templates/daemonset.yaml b/deploy/charts/csi-driver/templates/daemonset.yaml
@@ -84,6 +84,11 @@ spec:
             - --endpoint=$(CSI_ENDPOINT)
             - --data-root=csi-data-dir
             - --use-token-request={{ .Values.app.driver.useTokenRequest }}
+{{- if .Values.metrics.enabled }}
+            - --metrics-bind-address=:{{ .Values.metrics.port }}
+{{- else }}
+            - --metrics-bind-address=0
+{{- end }}
           env:
             - name: NODE_ID
               valueFrom:
@@ -103,6 +108,10 @@ spec:
           ports:
             - containerPort: {{.Values.app.livenessProbe.port}}
               name: healthz
+{{- if .Values.metrics.enabled }}
+            - containerPort: {{ .Values.metrics.port }}
+              name: http-metrics
+{{- end }}
           livenessProbe:
             httpGet:
               path: /healthz

diff --git a/deploy/charts/csi-driver/templates/podmonitor.yaml b/deploy/charts/csi-driver/templates/podmonitor.yaml
@@ -0,0 +1,48 @@
+{{- /*
+PodMonitor that lets Prometheus Operator scrape the csi-driver pods on their
+"http-metrics" port. Rendered only when both metrics.enabled and
+metrics.podmonitor.enabled are set.
+*/ -}}
+{{- if and .Values.metrics.enabled .Values.metrics.podmonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: {{ include "cert-manager-csi-driver.name" . }}
+{{- if .Values.metrics.podmonitor.namespace }}
+  namespace: {{ .Values.metrics.podmonitor.namespace | quote }}
+{{- else }}
+  namespace: {{ .Release.Namespace | quote }}
+{{- end }}
+  labels:
+    {{- include "cert-manager-csi-driver.labels" . | nindent 4 }}
+    prometheus: {{ .Values.metrics.podmonitor.prometheusInstance | quote }}
+    {{- with .Values.metrics.podmonitor.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+{{- with .Values.metrics.podmonitor.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+{{- end }}
+spec:
+  jobLabel: {{ include "cert-manager-csi-driver.name" . }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "cert-manager-csi-driver.name" . }}
+      app.kubernetes.io/instance: {{ .Release.Name }}
+{{- /* A PodMonitor placed in a custom namespace must select pods back in the release namespace. */}}
+{{- if .Values.metrics.podmonitor.namespace }}
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace | quote }}
+{{- end }}
+  podMetricsEndpoints:
+    - port: http-metrics
+      path: /metrics
+      interval: {{ .Values.metrics.podmonitor.interval }}
+      scrapeTimeout: {{ .Values.metrics.podmonitor.scrapeTimeout }}
+      honorLabels: {{ .Values.metrics.podmonitor.honorLabels }}
+      {{- /* Endpoint fields are indented 6 spaces, so the extra properties must be too. */}}
+      {{- with .Values.metrics.podmonitor.endpointAdditionalProperties }}
+      {{- toYaml . | nindent 6 }}
+      {{- end }}
+{{- end }}