Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add prometheus metric label "k8sgpt" #364

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 83 additions & 34 deletions controllers/k8sgpt_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,30 +49,30 @@ const (
var (
// Metrics
// k8sgptReconcileErrorCount is a metric for the number of errors during reconcile
k8sgptReconcileErrorCount = prometheus.NewCounter(prometheus.CounterOpts{
k8sgptReconcileErrorCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "k8sgpt_reconcile_error_count",
Help: "The total number of errors during reconcile",
})
}, []string{"k8sgpt"})
// k8sgptNumberOfResults is a metric for the number of results
k8sgptNumberOfResults = prometheus.NewGauge(prometheus.GaugeOpts{
k8sgptNumberOfResults = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "k8sgpt_number_of_results",
Help: "The total number of results",
})
}, []string{"k8sgpt"})
// k8sgptNumberOfResultsByType is a metric for the number of results by type
k8sgptNumberOfResultsByType = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "k8sgpt_number_of_results_by_type",
Help: "The total number of results by type",
}, []string{"kind", "name"})
}, []string{"kind", "name", "k8sgpt"})
// k8sgptNumberOfBackendAICalls is a metric for the number of backend AI calls
k8sgptNumberOfBackendAICalls = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "k8sgpt_number_of_backend_ai_calls",
Help: "The total number of backend AI calls",
}, []string{"backend", "deployment", "namespace"})
}, []string{"backend", "deployment", "namespace", "k8sgpt"})
// k8sgptNumberOfFailedBackendAICalls is a metric for the number of failed backend AI calls
k8sgptNumberOfFailedBackendAICalls = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "k8sgpt_number_of_failed_backend_ai_calls",
Help: "The total number of failed backend AI calls",
}, []string{"backend", "deployment", "namespace"})
}, []string{"backend", "deployment", "namespace", "k8sgpt"})
// analysisRetryCount is for the number of analysis failures
analysisRetryCount int
// allowBackendAIRequest is a circuit breaker that switches backend AI calls on/off
Expand Down Expand Up @@ -102,7 +102,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
err := r.Get(ctx, req.NamespacedName, k8sgptConfig)
if err != nil {
// Error reading the object - requeue the request.
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return ctrl.Result{}, client.IgnoreNotFound(err)
}

Expand All @@ -114,7 +116,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if !utils.ContainsString(k8sgptConfig.GetFinalizers(), FinalizerName) {
controllerutil.AddFinalizer(k8sgptConfig, FinalizerName)
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -125,12 +129,16 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
// Delete any external resources associated with the instance
err := resources.Sync(ctx, r.Client, *k8sgptConfig, resources.DestroyOp)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
controllerutil.RemoveFinalizer(k8sgptConfig, FinalizerName)
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -144,7 +152,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
MaxRetries: 5,
}
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -154,12 +164,16 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
err = r.Get(ctx, client.ObjectKey{Namespace: k8sgptConfig.Namespace,
Name: k8sgptConfig.Name}, &deployment)
if client.IgnoreNotFound(err) != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
err = resources.Sync(ctx, r.Client, *k8sgptConfig, resources.SyncOp)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}

Expand All @@ -179,7 +193,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
imageRepository, k8sgptConfig.Spec.Version)
err = r.Update(ctx, &deployment)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}

Expand All @@ -189,15 +205,19 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
// If the deployment is active, we will query it directly for analysis data
address, err := kclient.GenerateAddress(ctx, r.Client, k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
// Log address
fmt.Printf("K8sGPT address: %s\n", address)

k8sgptClient, err := kclient.NewClient(address)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}

Expand All @@ -207,14 +227,18 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if k8sgptConfig.Spec.RemoteCache != nil {
err = k8sgptClient.AddConfig(k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
if k8sgptConfig.Spec.Integrations != nil {
err = k8sgptClient.AddIntegration(k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -225,7 +249,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
k8sgptNumberOfFailedBackendAICalls.With(prometheus.Labels{
"backend": k8sgptConfig.Spec.AI.Backend,
"deployment": deployment.Name,
"namespace": deployment.Namespace}).Inc()
"namespace": deployment.Namespace,
"k8sgpt": k8sgptConfig.Name,
}).Inc()

if k8sgptConfig.Spec.AI.BackOff.Enabled {
if analysisRetryCount > k8sgptConfig.Spec.AI.BackOff.MaxRetries {
Expand All @@ -236,7 +262,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
analysisRetryCount++
}
}
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
// Reset analysisRetryCount
Expand All @@ -247,14 +275,21 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
k8sgptNumberOfBackendAICalls.With(prometheus.Labels{
"backend": k8sgptConfig.Spec.AI.Backend,
"deployment": deployment.Name,
"namespace": deployment.Namespace}).Inc()
"namespace": deployment.Namespace,
"k8sgpt": k8sgptConfig.Name,
}).Inc()
}

// Parse the k8sgpt-deployment response into a list of results
k8sgptNumberOfResults.Set(float64(len(response.Results)))
k8sgptNumberOfResults.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Set(float64(len(response.Results)))

rawResults, err := resources.MapResults(*r.Integrations, response.Results, *k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
// Prior to creating or updating any results we will delete any stale results that
Expand All @@ -266,7 +301,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
"k8sgpts.k8sgpt.ai/namespace": k8sgptConfig.Namespace,
}))
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
if len(resultList.Items) > 0 {
Expand All @@ -276,12 +313,15 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if _, ok := rawResults[result.Name]; !ok {
err = r.Delete(ctx, &result)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
} else {
k8sgptNumberOfResultsByType.With(prometheus.Labels{
"kind": result.Spec.Kind,
"name": result.Name,
"kind": result.Spec.Kind,
"name": result.Name,
"k8sgpt": k8sgptConfig.Name,
}).Dec()
}
}
Expand All @@ -292,15 +332,18 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
for _, result := range rawResults {
operation, err := resources.CreateOrUpdateResult(ctx, r.Client, result)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)

}
// Update metrics
if operation == resources.CreatedResult {
k8sgptNumberOfResultsByType.With(prometheus.Labels{
"kind": result.Spec.Kind,
"name": result.Name,
"kind": result.Spec.Kind,
"name": result.Name,
"k8sgpt": k8sgptConfig.Name,
}).Inc()
} else if operation == resources.UpdatedResult {
fmt.Printf("Updated successfully %s \n", result.Name)
Expand Down Expand Up @@ -334,7 +377,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
Name: k8sgptConfig.Spec.Sink.Secret.Name,
}
if err := r.Get(ctx, secretNamespacedName, secret); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(fmt.Errorf("could not find sink secret: %w", err), false)
}

Expand All @@ -353,7 +398,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if sinkEnabled {
if res.Status.LifeCycle != string(resources.NoOpResult) || res.Status.Webhook == "" {
if err := sinkType.Emit(res.Spec); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
res.Status.Webhook = k8sgptConfig.Spec.Sink.Endpoint
Expand All @@ -363,7 +410,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
res.Status.Webhook = ""
}
if err := r.Status().Update(ctx, &res); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand Down
2 changes: 1 addition & 1 deletion grafana/custom-metrics/custom-metrics-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": true,
"expr": "sum(rate(k8sgpt_reconcile_error_count{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod)",
"expr": "sum(rate(k8sgpt_reconcile_error_count{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod, k8sgpt)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
Expand Down
8 changes: 4 additions & 4 deletions grafana/k8sgpt-overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum by (kind) (k8sgpt_number_of_results_by_type)",
"expr": "sum by (kind, k8sgpt) (k8sgpt_number_of_results_by_type)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -192,7 +192,7 @@
"uid": "prometheus"
},
"editorMode": "builder",
"expr": "sum(k8sgpt_number_of_results)",
"expr": "sum by (k8sgpt) (k8sgpt_number_of_results)",
"range": true,
"refId": "A"
}
Expand Down Expand Up @@ -279,7 +279,7 @@
"uid": "prometheus"
},
"editorMode": "builder",
"expr": "count by(kind) (k8sgpt_number_of_results_by_type)",
"expr": "count by(kind, k8sgpt) (k8sgpt_number_of_results_by_type)",
"hide": false,
"range": true,
"refId": "A"
Expand Down Expand Up @@ -701,4 +701,4 @@
"uid": "U82QyO8Vz",
"version": 8,
"weekStart": ""
}
}