Skip to content

Commit

Permalink
chore: add labels to metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
Suraiya-Hameed committed Mar 22, 2024
1 parent d308d13 commit 525f176
Show file tree
Hide file tree
Showing 10 changed files with 282 additions and 126 deletions.
12 changes: 12 additions & 0 deletions controllers/secretproviderclasspodstatus_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ type SecretProviderClassPodStatusReconciler struct {
writer client.Writer
eventRecorder record.EventRecorder
driverName string
reporter StatsReporter
}

// New creates a new SecretProviderClassPodStatusReconciler
Expand All @@ -73,6 +74,10 @@ func New(driverName string, mgr manager.Manager, nodeID string) (*SecretProvider
kubeClient := kubernetes.NewForConfigOrDie(mgr.GetConfig())
eventBroadcaster.StartRecordingToSink(&clientcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "csi-secrets-store-controller"})
sr, err := newStatsReporter()
if err != nil {
return nil, err
}

return &SecretProviderClassPodStatusReconciler{
Client: mgr.GetClient(),
Expand All @@ -83,6 +88,7 @@ func New(driverName string, mgr manager.Manager, nodeID string) (*SecretProvider
writer: mgr.GetClient(),
eventRecorder: recorder,
driverName: driverName,
reporter: sr,
}, nil
}

Expand Down Expand Up @@ -266,6 +272,9 @@ func (r *SecretProviderClassPodStatusReconciler) Reconcile(ctx context.Context,
return ctrl.Result{}, nil
}

// if SecretObjects defined in the SPC, record the time to report sync_k8s_secret_duration_sec metric
begin := time.Now()

// determine which pod volume this is associated with
podVol := k8sutil.SPCVolume(pod, r.driverName, spc.Name)
if podVol == nil {
Expand Down Expand Up @@ -365,6 +374,9 @@ func (r *SecretProviderClassPodStatusReconciler) Reconcile(ctx context.Context,
return ctrl.Result{Requeue: true}, nil
}

r.reporter.ReportSyncSecretCtMetric(ctx, string(spc.Spec.Provider), spcPodStatus.Namespace, spc.Name)
r.reporter.ReportSyncSecretDuration(ctx, time.Since(begin).Seconds())

klog.InfoS("reconcile complete", "spc", klog.KObj(spc), "pod", klog.KObj(pod), "spcps", klog.KObj(spcPodStatus))
// requeue the spc pod status again after 5mins to check if secret and ownerRef exists
// and haven't been modified. If secret doesn't exist, then this requeue will ensure it's
Expand Down
80 changes: 80 additions & 0 deletions controllers/stats_reporter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controllers

import (
"context"
"runtime"

"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
"go.opentelemetry.io/otel/metric/global"
)

const (
// scope is the instrumentation scope name handed to global.Meter when this
// package obtains the meter used to create its metric instruments.
scope = "sigs.k8s.io/secrets-store-csi-driver"
)

// Attribute keys (and the one fixed attribute value) attached to the metrics
// recorded by this package's reporter.
var (
// providerKey labels the sync counter with the provider name.
providerKey = "provider"
// osTypeKey labels both the sync counter and the duration histogram with the
// operating system; its value is always runtimeOS.
osTypeKey = "os_type"
// runtimeOS is the operating system target this binary was built for
// (runtime.GOOS), used as the value of the os_type attribute.
runtimeOS = runtime.GOOS
// namespaceKey labels the sync counter with the namespace passed by the caller.
namespaceKey = "namespace"
// spcKey labels the sync counter with the SecretProviderClass name.
spcKey = "secret_provider_class"
)

// reporter is the StatsReporter implementation in this package. It holds the
// metric instruments created by newStatsReporter.
type reporter struct {
// syncK8sSecretTotal is the counter registered as "sync_k8s_secret"; it is
// incremented by one on each ReportSyncSecretCtMetric call.
syncK8sSecretTotal metric.Int64Counter
// syncK8sSecretDuration is the histogram registered as
// "sync_k8s_secret_duration_sec"; ReportSyncSecretDuration records into it.
syncK8sSecretDuration metric.Float64Histogram
}

// StatsReporter is the interface through which the controller reports
// Kubernetes secret sync metrics.
type StatsReporter interface {
// ReportSyncSecretCtMetric counts one secret sync, attributed to the given
// provider, namespace and SecretProviderClass name (spc).
ReportSyncSecretCtMetric(ctx context.Context, provider, namespace, spc string)
// ReportSyncSecretDuration records how long a secret sync took. The metric
// name ends in "_sec" and the visible caller passes a value in seconds.
ReportSyncSecretDuration(ctx context.Context, duration float64)
}

// newStatsReporter creates the metric instruments used to report secret sync
// statistics and returns them wrapped in a StatsReporter.
//
// The instruments are obtained from the global meter for this package's
// instrumentation scope. If either instrument cannot be created, the error is
// returned unchanged and no reporter is returned.
func newStatsReporter() (StatsReporter, error) {
	m := global.Meter(scope)

	// Counter of synced Kubernetes secrets.
	total, err := m.Int64Counter(
		"sync_k8s_secret",
		metric.WithDescription("Total number of k8s secrets synced"),
	)
	if err != nil {
		return nil, err
	}

	// Histogram of how long each sync took.
	elapsed, err := m.Float64Histogram(
		"sync_k8s_secret_duration_sec",
		metric.WithDescription("Distribution of how long it took to sync k8s secret"),
	)
	if err != nil {
		return nil, err
	}

	return &reporter{
		syncK8sSecretTotal:    total,
		syncK8sSecretDuration: elapsed,
	}, nil
}

// ReportSyncSecretCtMetric increments the secret sync counter by one.
//
// The data point carries the provider, the runtime OS, the namespace and the
// SecretProviderClass name (spc) as attributes.
func (r reporter) ReportSyncSecretCtMetric(ctx context.Context, provider, namespace, spc string) {
	labels := []attribute.KeyValue{
		attribute.String(providerKey, provider),
		attribute.String(osTypeKey, runtimeOS),
		attribute.String(namespaceKey, namespace),
		attribute.String(spcKey, spc),
	}
	r.syncK8sSecretTotal.Add(ctx, 1, metric.WithAttributes(labels...))
}

// ReportSyncSecretDuration records duration in the secret sync duration
// histogram, with the runtime OS as the only attribute.
func (r reporter) ReportSyncSecretDuration(ctx context.Context, duration float64) {
	r.syncK8sSecretDuration.Record(
		ctx,
		duration,
		metric.WithAttributes(attribute.String(osTypeKey, runtimeOS)),
	)
}
76 changes: 51 additions & 25 deletions docs/book/src/topics/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,17 @@ Prometheus is the only exporter that's currently supported with the driver.

## List of metrics provided by the driver

| Metric | Description | Tags |
| ------------------------------- | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------- |
| total_node_publish | Total number of successful volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>` |
| total_node_unpublish | Total number of successful volume unmount requests | `os_type=<runtime os>` |
| total_node_publish_error | Total number of errors with volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`error_type=<error code>` |
| total_node_unpublish_error | Total number of errors with volume unmount requests | `os_type=<runtime os>` |
| total_sync_k8s_secret | Total number of k8s secrets synced | `os_type=<runtime os>`<br>`provider=<provider name>` |
| sync_k8s_secret_duration_sec | Distribution of how long it took to sync k8s secret | `os_type=<runtime os>` |
| total_rotation_reconcile | Total number of rotation reconciles | `os_type=<runtime os>`<br>`rotated=<true or false>` |
| total_rotation_reconcile_error | Total number of rotation reconciles with error | `os_type=<runtime os>`<br>`rotated=<true or false>`<br>`error_type=<error code>` |
| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=<runtime os>` |
| Metric | Description | Tags |
|---------------------------------|---------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| node_publish_total | Total number of successful volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| node_unpublish_total | Total number of successful volume unmount requests | `os_type=<runtime os>` |
| node_publish_error_total | Total number of errors with volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`error_type=<error code>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| node_unpublish_error_total | Total number of errors with volume unmount requests | `os_type=<runtime os>` |
| sync_k8s_secret_total | Total number of k8s secrets synced | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`namespace=<namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| sync_k8s_secret_duration_sec | Distribution of how long it took to sync k8s secret | `os_type=<runtime os>` |
| rotation_reconcile_total | Total number of rotation reconciles | `os_type=<runtime os>`<br>`rotated=<true or false>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| rotation_reconcile_error_total | Total number of rotation reconciles with error | `os_type=<runtime os>`<br>`rotated=<true or false>`<br>`error_type=<error code>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=<runtime os>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |

Metrics are served from port 8095, but this port is not exposed outside the pod by default. Use kubectl port-forward to access the metrics over localhost:

Expand Down Expand Up @@ -47,17 +47,43 @@ sync_k8s_secret_duration_sec_bucket{os_type="linux",le="30"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="+Inf"} 1
sync_k8s_secret_duration_sec_sum{os_type="linux"} 0.3115892
sync_k8s_secret_duration_sec_count{os_type="linux"} 1
# HELP total_node_publish Total number of node publish calls
# TYPE total_node_publish counter
total_node_publish{os_type="linux",provider="azure"} 1
# HELP total_node_publish_error Total number of node publish calls with error
# TYPE total_node_publish_error counter
total_node_publish_error{error_type="ProviderBinaryNotFound",os_type="linux",provider="azure"} 2
total_node_publish_error{error_type="SecretProviderClassNotFound",os_type="linux",provider=""} 4
# HELP total_node_unpublish Total number of node unpublish calls
# TYPE total_node_unpublish counter
total_node_unpublish{os_type="linux"} 1
# HELP total_sync_k8s_secret Total number of k8s secrets synced
# TYPE total_sync_k8s_secret counter
total_sync_k8s_secret{os_type="linux",provider="azure"} 1
```

# HELP sync_k8s_secret_total Total number of k8s secrets synced
# TYPE sync_k8s_secret_total counter
sync_k8s_secret_total{namespace="csi-test-secret-ns",os_type="linux",provider="azure",secret_provider_class="csi-test-spc"} 1

# HELP rotation_reconcile_duration_sec Distribution of how long it took to rotate secrets-store content for pods
# TYPE rotation_reconcile_duration_sec histogram
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.1"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.2"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.3"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.4"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="1"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="1.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="2"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="2.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="3"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="10"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="15"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="30"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="+Inf"} 1
rotation_reconcile_duration_sec_sum{os_type="linux"} 0.3115892
rotation_reconcile_duration_sec_count{os_type="linux"} 1
# HELP rotation_reconcile_total Total number of rotation reconciles
# TYPE rotation_reconcile_total counter
rotation_reconcile_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 1
# HELP rotation_reconcile_error_total Total number of rotation reconciles with error
# TYPE rotation_reconcile_error_total counter
rotation_reconcile_error_total{error_type="GRPCProviderError",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 12
# HELP node_publish_total Total number of node publish calls
# TYPE node_publish_total counter
node_publish_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 1
# HELP node_publish_error_total Total number of node publish calls with error
# TYPE node_publish_error_total counter
node_publish_error_total{error_type="ProviderBinaryNotFound",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 7
# HELP node_unpublish_total Total number of node unpublish calls
# TYPE node_unpublish_total counter
node_unpublish_total{os_type="linux"} 1
```
33 changes: 18 additions & 15 deletions pkg/rotation/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,13 +251,16 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
// after the provider mount request is complete
var requiresUpdate bool
var providerName string
podName := spcps.Status.PodName
podNamespace := spcps.Namespace
secretProviderClass := spcps.Status.SecretProviderClassName

defer func() {
if err != nil {
r.reporter.reportRotationErrorCtMetric(ctx, providerName, errorReason, requiresUpdate)
r.reporter.reportRotationErrorCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, errorReason, requiresUpdate)
return
}
r.reporter.reportRotationCtMetric(ctx, providerName, requiresUpdate)
r.reporter.reportRotationCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, requiresUpdate)
r.reporter.reportRotationDuration(ctx, time.Since(begin).Seconds())
}()

Expand All @@ -266,14 +269,14 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
err = r.cache.Get(
ctx,
client.ObjectKey{
Namespace: spcps.Namespace,
Name: spcps.Status.PodName,
Namespace: podNamespace,
Name: podName,
},
pod,
)
if err != nil {
errorReason = internalerrors.PodNotFound
return fmt.Errorf("failed to get pod %s/%s, err: %w", spcps.Namespace, spcps.Status.PodName, err)
return fmt.Errorf("failed to get pod %s/%s, err: %w", podNamespace, podName, err)
}
// skip rotation if the pod is being terminated
// or the pod is in succeeded state (for jobs that complete aren't gc yet)
Expand All @@ -289,14 +292,14 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
err = r.cache.Get(
ctx,
client.ObjectKey{
Namespace: spcps.Namespace,
Name: spcps.Status.SecretProviderClassName,
Namespace: podNamespace,
Name: secretProviderClass,
},
spc,
)
if err != nil {
errorReason = internalerrors.SecretProviderClassNotFound
return fmt.Errorf("failed to get secret provider class %s/%s, err: %w", spcps.Namespace, spcps.Status.SecretProviderClassName, err)
return fmt.Errorf("failed to get secret provider class %s/%s, err: %w", podNamespace, secretProviderClass, err)
}

// determine which pod volume this is associated with
Expand Down Expand Up @@ -359,16 +362,16 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
// This comprises the secret parameter in the MountRequest to the provider
if nodePublishSecretRef != nil {
// read secret from the informer cache
secret, err := r.secretStore.GetNodePublishSecretRefSecret(nodePublishSecretRef.Name, spcps.Namespace)
secret, err := r.secretStore.GetNodePublishSecretRefSecret(nodePublishSecretRef.Name, podNamespace)
if err != nil {
if apierrors.IsNotFound(err) {
klog.ErrorS(err,
fmt.Sprintf("nodePublishSecretRef not found. If the secret with name exists in namespace, label the secret by running 'kubectl label secret %s %s=true -n %s", nodePublishSecretRef.Name, controllers.SecretUsedLabel, spcps.Namespace),
"name", nodePublishSecretRef.Name, "namespace", spcps.Namespace)
fmt.Sprintf("nodePublishSecretRef not found. If the secret with name exists in namespace, label the secret by running 'kubectl label secret %s %s=true -n %s", nodePublishSecretRef.Name, controllers.SecretUsedLabel, podNamespace),
"name", nodePublishSecretRef.Name, "namespace", podNamespace)
}
errorReason = internalerrors.NodePublishSecretRefNotFound
r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("failed to get node publish secret %s/%s, err: %+v", spcps.Namespace, nodePublishSecretRef.Name, err))
return fmt.Errorf("failed to get node publish secret %s/%s, err: %w", spcps.Namespace, nodePublishSecretRef.Name, err)
r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("failed to get node publish secret %s/%s, err: %+v", podNamespace, nodePublishSecretRef.Name, err))
return fmt.Errorf("failed to get node publish secret %s/%s, err: %w", podNamespace, nodePublishSecretRef.Name, err)
}

for k, v := range secret.Data {
Expand Down Expand Up @@ -401,7 +404,7 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
newObjectVersions, errorReason, err := secretsstore.MountContent(ctx, providerClient, string(paramsJSON), string(secretsJSON), spcps.Status.TargetPath, string(permissionJSON), oldObjectVersions)
if err != nil {
r.generateEvent(pod, corev1.EventTypeWarning, mountRotationFailedReason, fmt.Sprintf("provider mount err: %+v", err))
return fmt.Errorf("failed to rotate objects for pod %s/%s, err: %w", spcps.Namespace, spcps.Status.PodName, err)
return fmt.Errorf("failed to rotate objects for pod %s/%s, err: %w", podNamespace, podName, err)
}

// compare the old object versions and new object versions to check if any of the objects
Expand Down Expand Up @@ -488,7 +491,7 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret

patchFn := func() (bool, error) {
// patch secret data with the new contents
if err := r.patchSecret(ctx, secretObj.SecretName, spcps.Namespace, datamap); err != nil {
if err := r.patchSecret(ctx, secretObj.SecretName, podNamespace, datamap); err != nil {
// syncSecret.enabled is set to false by default in the helm chart for installing the driver in v0.0.23+
// that would result in a forbidden error, so generate a warning that can be helpful for debugging
if apierrors.IsForbidden(err) {
Expand Down
Loading

0 comments on commit 525f176

Please sign in to comment.