Skip to content

Commit

Permalink
chore: add labels to metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
Suraiya-Hameed committed Feb 22, 2024
1 parent 608fae1 commit 7eb8f38
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 95 deletions.
88 changes: 44 additions & 44 deletions docs/book/src/topics/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,15 @@ Prometheus is the only exporter that's currently supported with the driver.

## List of metrics provided by the driver

| Metric | Description | Tags |
| ------------------------------- | ------------------------------------------------------------------------- | --------------------------------------------------------------------------------- |
| total_node_publish | Total number of successful volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>` |
| total_node_unpublish | Total number of successful volume unmount requests | `os_type=<runtime os>` |
| total_node_publish_error | Total number of errors with volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`error_type=<error code>` |
| total_node_unpublish_error | Total number of errors with volume unmount requests | `os_type=<runtime os>` |
| total_sync_k8s_secret | Total number of k8s secrets synced | `os_type=<runtime os>`<br>`provider=<provider name>` |
| sync_k8s_secret_duration_sec | Distribution of how long it took to sync k8s secret | `os_type=<runtime os>` |
| total_rotation_reconcile | Total number of rotation reconciles | `os_type=<runtime os>`<br>`rotated=<true or false>` |
| total_rotation_reconcile_error | Total number of rotation reconciles with error | `os_type=<runtime os>`<br>`rotated=<true or false>`<br>`error_type=<error code>` |
| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=<runtime os>` |
| Metric | Description | Tags |
|---------------------------------|---------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| node_publish_total | Total number of successful volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| node_unpublish_total | Total number of successful volume unmount requests | `os_type=<runtime os>` |
| node_publish_error_total | Total number of errors with volume mount requests | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`error_type=<error code>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| node_unpublish_error_total | Total number of errors with volume unmount requests | `os_type=<runtime os>` |
| rotation_reconcile_total | Total number of rotation reconciles | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`rotated=<true or false>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| rotation_reconcile_error_total | Total number of rotation reconciles with error | `os_type=<runtime os>`<br>`provider=<provider name>`<br>`rotated=<true or false>`<br>`error_type=<error code>`<br>`pod_name=<pod_name>`<br>`pod_namespace=<pod_namespace>`<br>`secret_provider_class=<secret_provider_class>` |
| rotation_reconcile_duration_sec | Distribution of how long it took to rotate secrets-store content for pods | `os_type=<runtime os>` |

Metrics are served from port 8095, but this port is not exposed outside the pod by default. Use kubectl port-forward to access the metrics over localhost:

Expand All @@ -28,36 +26,38 @@ curl localhost:8095/metrics
### Sample Metrics output

```shell
# HELP sync_k8s_secret_duration_sec Distribution of how long it took to sync k8s secret
# TYPE sync_k8s_secret_duration_sec histogram
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="0.1"} 0
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="0.2"} 0
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="0.3"} 0
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="0.4"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="0.5"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="1"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="1.5"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="2"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="2.5"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="3"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="5"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="10"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="15"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="30"} 1
sync_k8s_secret_duration_sec_bucket{os_type="linux",le="+Inf"} 1
sync_k8s_secret_duration_sec_sum{os_type="linux"} 0.3115892
sync_k8s_secret_duration_sec_count{os_type="linux"} 1
# HELP total_node_publish Total number of node publish calls
# TYPE total_node_publish counter
total_node_publish{os_type="linux",provider="azure"} 1
# HELP total_node_publish_error Total number of node publish calls with error
# TYPE total_node_publish_error counter
total_node_publish_error{error_type="ProviderBinaryNotFound",os_type="linux",provider="azure"} 2
total_node_publish_error{error_type="SecretProviderClassNotFound",os_type="linux",provider=""} 4
# HELP total_node_unpublish Total number of node unpublish calls
# TYPE total_node_unpublish counter
total_node_unpublish{os_type="linux"} 1
# HELP total_sync_k8s_secret Total number of k8s secrets synced
# TYPE total_sync_k8s_secret counter
total_sync_k8s_secret{os_type="linux",provider="azure"} 1
```
# HELP rotation_reconcile_duration_sec Distribution of how long it took to rotate secrets-store content for pods
# TYPE rotation_reconcile_duration_sec histogram
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.1"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.2"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.3"} 0
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.4"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="0.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="1"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="1.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="2"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="2.5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="3"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="5"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="10"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="15"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="30"} 1
rotation_reconcile_duration_sec_bucket{os_type="linux",le="+Inf"} 1
rotation_reconcile_duration_sec_sum{os_type="linux"} 0.3115892
rotation_reconcile_duration_sec_count{os_type="linux"} 1
# HELP rotation_reconcile_total Total number of rotation reconciles
# TYPE rotation_reconcile_total counter
rotation_reconcile_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 1
# HELP rotation_reconcile_error_total Total number of rotation reconciles with error
# TYPE rotation_reconcile_error_total counter
rotation_reconcile_error_total{error_type="GRPCProviderError",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",rotated="false",secret_provider_class="csi-test-spc"} 12
# HELP node_publish_total Total number of node publish calls
# TYPE node_publish_total counter
node_publish_total{os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 1
# HELP node_publish_error_total Total number of node publish calls with error
# TYPE node_publish_error_total counter
node_publish_error_total{error_type="ProviderBinaryNotFound",os_type="linux",pod_name="csi-test-app-wcsxk",pod_namespace="csi-test-secret-ns",provider="azure",secret_provider_class="csi-test-spc"} 7
# HELP node_unpublish_total Total number of node unpublish calls
# TYPE node_unpublish_total counter
node_unpublish_total{os_type="linux"} 1
```
7 changes: 5 additions & 2 deletions pkg/rotation/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,13 +251,16 @@ func (r *Reconciler) reconcile(ctx context.Context, spcps *secretsstorev1.Secret
// after the provider mount request is complete
var requiresUpdate bool
var providerName string
podName := spcps.Status.PodName
podNamespace := spcps.Namespace
secretProviderClass := spcps.Status.SecretProviderClassName

defer func() {
if err != nil {
r.reporter.reportRotationErrorCtMetric(ctx, providerName, errorReason, requiresUpdate)
r.reporter.reportRotationErrorCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, errorReason, requiresUpdate)
return
}
r.reporter.reportRotationCtMetric(ctx, providerName, requiresUpdate)
r.reporter.reportRotationCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, requiresUpdate)
r.reporter.reportRotationDuration(ctx, time.Since(begin).Seconds())
}()

Expand Down
27 changes: 18 additions & 9 deletions pkg/rotation/stats_reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,14 @@ const (
)

var (
providerKey = "provider"
errorKey = "error_type"
osTypeKey = "os_type"
rotatedKey = "rotated"
runtimeOS = runtime.GOOS
providerKey = "provider"
errorKey = "error_type"
osTypeKey = "os_type"
rotatedKey = "rotated"
runtimeOS = runtime.GOOS
podNameKey = "pod_name"
podNamespaceKey = "pod_namespace"
spcKey = "secret_provider_class"
)

type reporter struct {
Expand All @@ -44,8 +47,8 @@ type reporter struct {
}

type StatsReporter interface {
reportRotationCtMetric(ctx context.Context, provider string, wasRotated bool)
reportRotationErrorCtMetric(ctx context.Context, provider, errType string, wasRotated bool)
reportRotationCtMetric(ctx context.Context, provider, podName, podNamespace, spc string, wasRotated bool)
reportRotationErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string, wasRotated bool)
reportRotationDuration(ctx context.Context, duration float64)
}

Expand All @@ -67,21 +70,27 @@ func newStatsReporter() (StatsReporter, error) {
return r, nil
}

func (r *reporter) reportRotationCtMetric(ctx context.Context, provider string, wasRotated bool) {
func (r *reporter) reportRotationCtMetric(ctx context.Context, provider, podName, podNamespace, spc string, wasRotated bool) {
opt := metric.WithAttributes(
attribute.Key(providerKey).String(provider),
attribute.Key(osTypeKey).String(runtimeOS),
attribute.Key(rotatedKey).Bool(wasRotated),
attribute.Key(podNameKey).String(podName),
attribute.Key(podNamespaceKey).String(podNamespace),
attribute.Key(spcKey).String(spc),
)
r.rotationReconcileTotal.Add(ctx, 1, opt)
}

func (r *reporter) reportRotationErrorCtMetric(ctx context.Context, provider, errType string, wasRotated bool) {
func (r *reporter) reportRotationErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string, wasRotated bool) {
opt := metric.WithAttributes(
attribute.Key(providerKey).String(provider),
attribute.Key(errorKey).String(errType),
attribute.Key(osTypeKey).String(runtimeOS),
attribute.Key(rotatedKey).Bool(wasRotated),
attribute.Key(podNameKey).String(podName),
attribute.Key(podNamespaceKey).String(podNamespace),
attribute.Key(spcKey).String(spc),
)
r.rotationReconcileErrorTotal.Add(ctx, 1, opt)
}
Expand Down
36 changes: 32 additions & 4 deletions pkg/secrets-store/mocks/stats_reporter_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,60 @@ package mocks // import sigs.k8s.io/secrets-store-csi-driver/pkg/secrets-store/m

import "context"

// MetricDetails holds the label values passed to a single FakeReporter
// metric call so that tests can inspect them afterwards.
type MetricDetails struct {
// Provider is the provider argument of the recorded call.
Provider string
// PodName is the podName argument of the recorded call.
PodName string
// PodNamespace is the podNamespace argument of the recorded call.
PodNamespace string
// Spc is the spc argument of the recorded call
// (presumably the SecretProviderClass name; confirm against callers).
Spc string
// ErrorType is the errType argument; it is left empty by
// ReportNodePublishCtMetric, which has no error type to record.
ErrorType string
}

// FakeReporter is a test double for the stats reporter. It counts how many
// times each Report* method is invoked and keeps the label values passed to
// the node publish reporting methods.
type FakeReporter struct {
// Per-method invocation counters.
reportNodePublishCtMetricInvoked int
reportNodeUnPublishCtMetricInvoked int
reportNodePublishErrorCtMetricInvoked int
reportNodeUnPublishErrorCtMetricInvoked int
reportSyncK8SecretCtMetricInvoked int
reportSyncK8SecretDurationInvoked int
// metricDetails accumulates one entry per ReportNodePublishCtMetric or
// ReportNodePublishErrorCtMetric call, in call order.
metricDetails []MetricDetails
}

func NewFakeReporter() *FakeReporter {
return &FakeReporter{}
return &FakeReporter{
metricDetails: []MetricDetails{},
}
}

func (f *FakeReporter) ReportNodePublishCtMetric(ctx context.Context, provider string) {
func (f *FakeReporter) ReportNodePublishCtMetric(ctx context.Context, provider, podName, podNamespace, spc string) {
f.reportNodePublishCtMetricInvoked++
f.metricDetails = append(f.metricDetails, MetricDetails{
Provider: provider,
PodName: podName,
PodNamespace: podNamespace,
Spc: spc,
})
}

// ReportNodeUnPublishCtMetric bumps the invocation counter for this method.
// The context argument is ignored.
func (f *FakeReporter) ReportNodeUnPublishCtMetric(ctx context.Context) {
	f.reportNodeUnPublishCtMetricInvoked += 1
}

func (f *FakeReporter) ReportNodePublishErrorCtMetric(ctx context.Context, provider, errType string) {
func (f *FakeReporter) ReportNodePublishErrorCtMetric(ctx context.Context, provider, podName, podNamespace, spc, errType string) {
f.reportNodePublishErrorCtMetricInvoked++
f.metricDetails = append(f.metricDetails, MetricDetails{
Provider: provider,
PodName: podName,
PodNamespace: podNamespace,
Spc: spc,
ErrorType: errType,
})
}

// ReportNodeUnPublishErrorCtMetric bumps the invocation counter for this
// method. The context argument is ignored.
func (f *FakeReporter) ReportNodeUnPublishErrorCtMetric(ctx context.Context) {
	f.reportNodeUnPublishErrorCtMetricInvoked += 1
}

func (f *FakeReporter) ReportSyncK8SecretCtMetric(ctx context.Context, provider string, count int) {
func (f *FakeReporter) ReportSyncK8SecretCtMetric(ctx context.Context, provider, podName, podNamespace, spc string, count int) {
f.reportSyncK8SecretCtMetricInvoked++
}

Expand All @@ -73,3 +97,7 @@ func (f *FakeReporter) ReportSyncK8SecretCtMetricInvoked() int {
// ReportSyncK8SecretDurationInvoked returns the current value of the
// reportSyncK8SecretDurationInvoked counter.
func (f *FakeReporter) ReportSyncK8SecretDurationInvoked() int {
	calls := f.reportSyncK8SecretDurationInvoked
	return calls
}

// GetMetricDetails returns the metric details recorded so far. The returned
// slice is the reporter's own slice, not a copy.
func (f *FakeReporter) GetMetricDetails() []MetricDetails {
	recorded := f.metricDetails
	return recorded
}
8 changes: 4 additions & 4 deletions pkg/secrets-store/nodeserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
startTime := time.Now()
var parameters map[string]string
var providerName string
var podName, podNamespace, podUID, serviceAccountName string
var podName, podNamespace, podUID, serviceAccountName, secretProviderClass string
var targetPath string
var mounted bool
errorReason := internalerrors.FailedToMount
Expand All @@ -89,10 +89,10 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
klog.ErrorS(unmountErr, "failed to unmounting target path")
}
}
ns.reporter.ReportNodePublishErrorCtMetric(ctx, providerName, errorReason)
ns.reporter.ReportNodePublishErrorCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass, errorReason)
return
}
ns.reporter.ReportNodePublishCtMetric(ctx, providerName)
ns.reporter.ReportNodePublishCtMetric(ctx, providerName, podName, podNamespace, secretProviderClass)
}()

// Check arguments
Expand All @@ -115,7 +115,7 @@ func (ns *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
mountFlags := req.GetVolumeCapability().GetMount().GetMountFlags()
secrets := req.GetSecrets()

secretProviderClass := attrib[secretProviderClassField]
secretProviderClass = attrib[secretProviderClassField]
providerName = attrib["providerName"]
podName = attrib[CSIPodName]
podNamespace = attrib[CSIPodNamespace]
Expand Down
Loading

0 comments on commit 7eb8f38

Please sign in to comment.