From 266eeee2e9372ef8060d88a6bc3521f308335a3b Mon Sep 17 00:00:00 2001
From: avlitman
Date: Sun, 31 Dec 2023 14:21:30 +0200
Subject: [PATCH] Refactor recording-rules and alerts code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: avlitman

Bump github.com/machadovilaca/operator-observability

Signed-off-by: João Vilaça

Update prometheus rules controller

Signed-off-by: João Vilaça

Update metrics docs

Signed-off-by: João Vilaça

Update hack/prom-rule-ci/rule-spec-dumper

Signed-off-by: João Vilaça

delete cluster_operator.go and fix tools location

Signed-off-by: avlitman
---
 controllers/alerts/alerts.go                  | 195 ++----------------
 controllers/alerts/metrics_test.go            |  67 ++++--
 controllers/alerts/reconciler.go              |  10 +-
 .../hyperconverged_controller_test.go         |  10 +-
 controllers/hyperconverged/testUtils_test.go  |   7 +-
 docs/metrics.md                               |   3 +
 hack/prom-rule-ci/rule-spec-dumper.go         |  27 ++-
 .../rules/alerts/operator_alerts.go           |  70 +++++++
 pkg/monitoring/rules/alerts/prometheus.go     |  52 +++++
 .../rules/recordingrules/operator.go          |  26 +++
 .../rules/recordingrules/recordingrules.go    |   9 +
 pkg/monitoring/rules/rules.go                 |  52 +++++
 tools/metricsdocs/metricsdocs.go              |  12 +-
 .../metrics_json_generator.go                 |  22 +-
 14 files changed, 348 insertions(+), 214 deletions(-)
 create mode 100644 pkg/monitoring/rules/alerts/operator_alerts.go
 create mode 100644 pkg/monitoring/rules/alerts/prometheus.go
 create mode 100644 pkg/monitoring/rules/recordingrules/operator.go
 create mode 100644 pkg/monitoring/rules/recordingrules/recordingrules.go
 create mode 100644 pkg/monitoring/rules/rules.go

diff --git a/controllers/alerts/alerts.go b/controllers/alerts/alerts.go
index 7cde00d79a..0ac3925b15 100644
--- a/controllers/alerts/alerts.go
+++ b/controllers/alerts/alerts.go
@@ -1,81 +1,47 @@
 package alerts
 
-// This package makes sure that the PrometheusRule is present with the right configurations.
-// This code was taken out of the operator package, because the operand reconciliation is done
-// only if the HyperConverged CR is present. But we need the alert in place even if the CR was
-// not created.
-
 import (
 	"context"
-	"errors"
-	"fmt"
-	"os"
 	"reflect"
-	"strings"
 
 	"github.com/go-logr/logr"
-	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/util/intstr"
-	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
 	hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
 )
 
 const (
-	alertRuleGroup                = "kubevirt.hyperconverged.rules"
-	outOfBandUpdateAlert          = "KubeVirtCRModified"
-	unsafeModificationAlert       = "UnsupportedHCOModification"
-	installationNotCompletedAlert = "HCOInstallationIncomplete"
-	singleStackIPv6Alert          = "SingleStackIPv6Unsupported"
-	severityAlertLabelKey         = "severity"
-	healthImpactAlertLabelKey     = "operator_health_impact"
-	partOfAlertLabelKey           = "kubernetes_operator_part_of"
-	partOfAlertLabelValue         = "kubevirt"
-	componentAlertLabelKey        = "kubernetes_operator_component"
-	componentAlertLabelValue      = "hyperconverged-cluster-operator"
-	ruleName                      = hcoutil.HyperConvergedName + "-prometheus-rule"
-	defaultRunbookURLTemplate     = "https://kubevirt.io/monitoring/runbooks/%s"
-	runbookURLTemplateEnv         = "RUNBOOK_URL_TEMPLATE"
+	ruleName                  = hcoutil.HyperConvergedName + "-prometheus-rule"
+	defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
+	runbookURLTemplateEnv     = "RUNBOOK_URL_TEMPLATE"
 )
 
-type runbookCreator struct {
-	template string
+type AlertRuleReconciler struct {
+	theRule *promv1.PrometheusRule
 }
 
-func newRunbookCreator() *runbookCreator {
-	runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
-	if !exists {
-		runbookURLTemplate = defaultRunbookURLTemplate
-	}
-
-	if strings.Count(runbookURLTemplate, "%s") != 1 {
-		panic(errors.New("runbooks URL template must have exactly 1 %s substring"))
+// newAlertRuleReconciler creates a new AlertRuleReconciler instance and returns a pointer to it.
+func newAlertRuleReconciler(namespace string, owner metav1.OwnerReference) (*AlertRuleReconciler, error) {
+	err := rules.SetupRules()
+	if err != nil {
+		return nil, err
 	}
 
-	return &runbookCreator{
-		template: runbookURLTemplate,
+	rule, err := rules.BuildPrometheusRule(namespace, owner)
+	if err != nil {
+		return nil, err
 	}
-}
-
-func (r runbookCreator) getURL(alertName string) string {
-	return fmt.Sprintf(r.template, alertName)
-}
-
-type AlertRuleReconciler struct {
-	theRule *monitoringv1.PrometheusRule
-}
 
-// newAlertRuleReconciler creates new AlertRuleReconciler instance and returns a pointer to it.
-func newAlertRuleReconciler(namespace string, owner metav1.OwnerReference) *AlertRuleReconciler {
 	return &AlertRuleReconciler{
-		theRule: newPrometheusRule(namespace, owner),
-	}
+		theRule: rule,
+	}, nil
 }
 
 func (r *AlertRuleReconciler) Kind() string {
-	return monitoringv1.PrometheusRuleKind
+	return promv1.PrometheusRuleKind
 }
 
 func (r *AlertRuleReconciler) ResourceName() string {
@@ -87,12 +53,12 @@ func (r *AlertRuleReconciler) GetFullResource() client.Object {
 }
 
 func (r *AlertRuleReconciler) EmptyObject() client.Object {
-	return &monitoringv1.PrometheusRule{}
+	return &promv1.PrometheusRule{}
 }
 
 func (r *AlertRuleReconciler) UpdateExistingResource(ctx context.Context, cl client.Client, existing client.Object, logger logr.Logger) (client.Object, bool, error) {
 	needUpdate := false
-	rule := existing.(*monitoringv1.PrometheusRule)
+	rule := existing.(*promv1.PrometheusRule)
 	if !reflect.DeepEqual(r.theRule.Spec, rule.Spec) {
 		needUpdate = true
 		r.theRule.Spec.DeepCopyInto(&rule.Spec)
@@ -112,124 +78,3 @@ func (r *AlertRuleReconciler) UpdateExistingResource(ctx context.Context, cl cli
 
 	return rule, needUpdate, nil
 }
-
-func newPrometheusRule(namespace string, owner metav1.OwnerReference) *monitoringv1.PrometheusRule {
-	return &monitoringv1.PrometheusRule{
-		TypeMeta: metav1.TypeMeta{
-			APIVersion: monitoringv1.SchemeGroupVersion.String(),
-			Kind:       "PrometheusRule",
-		},
-		ObjectMeta: metav1.ObjectMeta{
-			Name:            ruleName,
-			Labels:          hcoutil.GetLabels(hcoutil.HyperConvergedName, hcoutil.AppComponentMonitoring),
-			Namespace:       namespace,
-			OwnerReferences: []metav1.OwnerReference{owner},
-		},
-		Spec: *NewPrometheusRuleSpec(),
-	}
-}
-
-// NewPrometheusRuleSpec creates PrometheusRuleSpec for alert rules
-func NewPrometheusRuleSpec() *monitoringv1.PrometheusRuleSpec {
-	runbookCreator := newRunbookCreator()
-
-	spec := &monitoringv1.PrometheusRuleSpec{
-		Groups: []monitoringv1.RuleGroup{{
-			Name: alertRuleGroup,
-			Rules: []monitoringv1.Rule{
-				createOutOfBandUpdateAlertRule(),
-				createUnsafeModificationAlertRule(),
-				createInstallationNotCompletedAlertRule(),
-				createRequestCPUCoresRule(),
-				createOperatorHealthStatusRule(),
-				createSingleStackIPv6AlertRule(),
-			},
-		}},
-	}
-
-	for _, rule := range spec.Groups[0].Rules {
-		if rule.Alert != "" {
-			rule.Annotations["runbook_url"] = runbookCreator.getURL(rule.Alert)
-			rule.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
-			rule.Labels[componentAlertLabelKey] = componentAlertLabelValue
-		}
-	}
-
-	return spec
-}
-
-func createOutOfBandUpdateAlertRule() monitoringv1.Rule {
-	return monitoringv1.Rule{
-		Alert: outOfBandUpdateAlert,
-		Expr:  intstr.FromString("sum by(component_name) ((round(increase(kubevirt_hco_out_of_band_modifications_total[10m]))>0 and kubevirt_hco_out_of_band_modifications_total offset 10m) or (kubevirt_hco_out_of_band_modifications_total != 0 unless kubevirt_hco_out_of_band_modifications_total offset 10m))"),
-		Annotations: map[string]string{
-			"description": "Out-of-band modification for {{ $labels.component_name }}.",
-			"summary":     "{{ $value }} out-of-band CR modifications were detected in the last 10 minutes.",
-		},
-		Labels: map[string]string{
-			severityAlertLabelKey:     "warning",
-			healthImpactAlertLabelKey: "warning",
-		},
-	}
-}
-
-func createUnsafeModificationAlertRule() monitoringv1.Rule {
-	return monitoringv1.Rule{
-		Alert: unsafeModificationAlert,
-		Expr:  intstr.FromString("sum by(annotation_name) ((kubevirt_hco_unsafe_modifications)>0)"),
-		Annotations: map[string]string{
-			"description": "unsafe modification for the {{ $labels.annotation_name }} annotation in the HyperConverged resource.",
-			"summary":     "{{ $value }} unsafe modifications were detected in the HyperConverged resource.",
-		},
-		Labels: map[string]string{
-			severityAlertLabelKey:     "info",
-			healthImpactAlertLabelKey: "none",
-		},
-	}
-}
-
-func createInstallationNotCompletedAlertRule() monitoringv1.Rule {
-	return monitoringv1.Rule{
-		Alert: installationNotCompletedAlert,
-		Expr:  intstr.FromString("kubevirt_hco_hyperconverged_cr_exists == 0"),
-		Annotations: map[string]string{
-			"description": "the installation was not completed; the HyperConverged custom resource is missing. In order to complete the installation of the Hyperconverged Cluster Operator you should create the HyperConverged custom resource.",
-			"summary":     "the installation was not completed; to complete the installation, create a HyperConverged custom resource.",
-		},
-		For: ptr.To[monitoringv1.Duration]("1h"),
-		Labels: map[string]string{
-			severityAlertLabelKey:     "info",
-			healthImpactAlertLabelKey: "critical",
-		},
-	}
-}
-
-// Recording rules for openshift/cluster-monitoring-operator
-func createRequestCPUCoresRule() monitoringv1.Rule {
-	return monitoringv1.Rule{
-		Record: "cluster:vmi_request_cpu_cores:sum",
-		Expr:   intstr.FromString(`sum(kube_pod_container_resource_requests{resource="cpu"} and on (pod) kube_pod_status_phase{phase="Running"} * on (pod) group_left kube_pod_labels{ label_kubevirt_io="virt-launcher"} > 0)`),
-	}
-}
-
-func createOperatorHealthStatusRule() monitoringv1.Rule {
-	return monitoringv1.Rule{
-		Record: "kubevirt_hyperconverged_operator_health_status",
-		Expr:   intstr.FromString(`label_replace(vector(2) and on() ((kubevirt_hco_system_health_status>1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="critical"})>0)) or (vector(1) and on() ((kubevirt_hco_system_health_status==1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"})>0))) or vector(0),"name","kubevirt-hyperconverged","","")`),
-	}
-}
-
-func createSingleStackIPv6AlertRule() monitoringv1.Rule {
-	return monitoringv1.Rule{
-		Alert: singleStackIPv6Alert,
-		Expr:  intstr.FromString("kubevirt_hco_single_stack_ipv6 == 1"),
-		Annotations: map[string]string{
-			"description": "KubeVirt Hyperconverged is not supported on a single stack IPv6 cluster",
-			"summary":     "KubeVirt Hyperconverged is not supported on a single stack IPv6 cluster",
-		},
-		Labels: map[string]string{
-			severityAlertLabelKey:     "critical",
-			healthImpactAlertLabelKey: "critical",
-		},
-	}
-}
diff --git a/controllers/alerts/metrics_test.go b/controllers/alerts/metrics_test.go
index b9b8bb61d0..3c66d908f4 100644
--- a/controllers/alerts/metrics_test.go
+++ b/controllers/alerts/metrics_test.go
@@ -7,21 +7,23 @@ import (
 	"os"
 	"testing"
 
-	"k8s.io/utils/ptr"
-
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+
+	"github.com/machadovilaca/operator-observability/pkg/operatorrules"
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	"github.com/kubevirt/hyperconverged-cluster-operator/controllers/common"
 	"github.com/kubevirt/hyperconverged-cluster-operator/controllers/commontestutils"
 	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
 	hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
 )
 
@@ -48,6 +50,8 @@ var _ = Describe("alert tests", func() {
 		}
 
 		req = commontestutils.NewReq(nil)
+
+		operatorrules.CleanRegistry()
 	})
 
 	Context("test reconciler", func() {
@@ -125,10 +129,14 @@ var _ = Describe("alert tests", func() {
 	Context("test PrometheusRule", func() {
 		BeforeEach(func() {
 			currentMetric, _ = metrics.GetOverwrittenModificationsCount(monitoringv1.PrometheusRuleKind, ruleName)
+
+			err := rules.SetupRules()
+			Expect(err).ToNot(HaveOccurred())
 		})
 
 		AfterEach(func() {
-			os.Unsetenv(runbookURLTemplateEnv)
+			err := os.Unsetenv(runbookURLTemplateEnv)
+			Expect(err).ToNot(HaveOccurred())
 		})
 
 		expectedEvents := []commontestutils.MockEvent{
@@ -141,7 +149,8 @@ var _ = Describe("alert tests", func() {
 
 		It("should update the labels if modified", func() {
 			owner := getDeploymentReference(ci.GetDeployment())
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 			existRule.Labels = map[string]string{
 				"wrongKey1": "wrongValue1",
 				"wrongKey2": "wrongValue2",
@@ -162,7 +171,8 @@ var _ = Describe("alert tests", func() {
 
 		It("should add the labels if it's missing", func() {
 			owner := getDeploymentReference(ci.GetDeployment())
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 			existRule.Labels = nil
 
 			cl := commontestutils.InitClient([]client.Object{ns, existRule})
@@ -186,7 +196,8 @@ var _ = Describe("alert tests", func() {
 				BlockOwnerDeletion: ptr.To(true),
 				UID:                "0987654321",
 			}
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 
 			cl := commontestutils.InitClient([]client.Object{ns, existRule})
 			r := NewMonitoringReconciler(ci, cl, ee, commontestutils.GetScheme())
@@ -217,7 +228,8 @@ var _ = Describe("alert tests", func() {
 				BlockOwnerDeletion: ptr.To(true),
 				UID:                "0987654321",
 			}
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 
 			cl := commontestutils.InitClient([]client.Object{ns, existRule})
 			r := NewMonitoringReconciler(ci, cl, ee, commontestutils.GetScheme())
@@ -247,7 +259,8 @@ var _ = Describe("alert tests", func() {
 
 		It("should update the referenceOwner if missing", func() {
 			owner := metav1.OwnerReference{}
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 			existRule.OwnerReferences = nil
 			cl := commontestutils.InitClient([]client.Object{ns, existRule})
 			r := NewMonitoringReconciler(ci, cl, ee, commontestutils.GetScheme())
@@ -270,15 +283,16 @@ var _ = Describe("alert tests", func() {
 
 		It("should update the spec if modified", func() {
 			owner := getDeploymentReference(ci.GetDeployment())
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 
-			existRule.Spec.Groups[0].Rules = []monitoringv1.Rule{
-				existRule.Spec.Groups[0].Rules[0],
-				existRule.Spec.Groups[0].Rules[2],
-				existRule.Spec.Groups[0].Rules[3],
+			existRule.Spec.Groups[1].Rules = []monitoringv1.Rule{
+				existRule.Spec.Groups[1].Rules[0],
+				existRule.Spec.Groups[1].Rules[2],
+				existRule.Spec.Groups[1].Rules[3],
 			}
 			// modify the first rule
-			existRule.Spec.Groups[0].Rules[0].Alert = "modified alert"
+			existRule.Spec.Groups[1].Rules[0].Alert = "modified alert"
 
 			cl := commontestutils.InitClient([]client.Object{ns, existRule})
 			r := NewMonitoringReconciler(ci, cl, ee, commontestutils.GetScheme())
@@ -286,7 +300,9 @@ var _ = Describe("alert tests", func() {
 			Expect(r.Reconcile(req, false)).Should(Succeed())
 			pr := &monitoringv1.PrometheusRule{}
 			Expect(cl.Get(context.Background(), client.ObjectKey{Namespace: r.namespace, Name: ruleName}, pr)).Should(Succeed())
-			Expect(pr.Spec).Should(Equal(*NewPrometheusRuleSpec()))
+			newRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(pr.Spec).Should(Equal(newRule.Spec))
 
 			Expect(ee.CheckEvents(expectedEvents)).To(BeTrue())
 			Expect(metrics.GetOverwrittenModificationsCount(monitoringv1.PrometheusRuleKind, ruleName)).Should(BeEquivalentTo(currentMetric))
@@ -294,7 +310,8 @@ var _ = Describe("alert tests", func() {
 
 		It("should update the spec if it's missing", func() {
 			owner := getDeploymentReference(ci.GetDeployment())
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 
 			existRule.Spec = monitoringv1.PrometheusRuleSpec{}
 
@@ -304,7 +321,9 @@ var _ = Describe("alert tests", func() {
 			Expect(r.Reconcile(req, false)).Should(Succeed())
 			pr := &monitoringv1.PrometheusRule{}
 			Expect(cl.Get(context.Background(), client.ObjectKey{Namespace: r.namespace, Name: ruleName}, pr)).Should(Succeed())
-			Expect(pr.Spec).Should(Equal(*NewPrometheusRuleSpec()))
+			newRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(pr.Spec).Should(Equal(newRule.Spec))
 
 			Expect(ee.CheckEvents(expectedEvents)).To(BeTrue())
 			Expect(metrics.GetOverwrittenModificationsCount(monitoringv1.PrometheusRuleKind, ruleName)).Should(BeEquivalentTo(currentMetric))
@@ -312,7 +331,8 @@ var _ = Describe("alert tests", func() {
 
 		It("should use the default runbook URL template when no ENV Variable is set", func() {
 			owner := getDeploymentReference(ci.GetDeployment())
-			promRule := newPrometheusRule(commontestutils.Namespace, owner)
+			promRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 
 			for _, group := range promRule.Spec.Groups {
 				for _, rule := range group.Rules {
@@ -326,11 +346,17 @@ var _ = Describe("alert tests", func() {
 		})
 
 		It("should use the desired runbook URL template when its ENV Variable is set", func() {
+			operatorrules.CleanRegistry()
+
 			desiredRunbookURLTemplate := "desired/runbookURL/template/%s"
 			os.Setenv(runbookURLTemplateEnv, desiredRunbookURLTemplate)
 
+			err := rules.SetupRules()
+			Expect(err).ToNot(HaveOccurred())
+
 			owner := getDeploymentReference(ci.GetDeployment())
-			promRule := newPrometheusRule(commontestutils.Namespace, owner)
+			promRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 
 			for _, group := range promRule.Spec.Groups {
 				for _, rule := range group.Rules {
@@ -355,7 +381,8 @@ var _ = Describe("alert tests", func() {
 				BlockOwnerDeletion: ptr.To(true),
 				UID:                "0987654321",
 			}
-			existRule := newPrometheusRule(commontestutils.Namespace, owner)
+			existRule, err := rules.BuildPrometheusRule(commontestutils.Namespace, owner)
+			Expect(err).ToNot(HaveOccurred())
 
 			cl := commontestutils.InitClient([]client.Object{ns, existRule})
 			r := NewMonitoringReconciler(ci, cl, ee, commontestutils.GetScheme())
diff --git a/controllers/alerts/reconciler.go b/controllers/alerts/reconciler.go
index 5c9ded9097..9ed8005d8d 100644
--- a/controllers/alerts/reconciler.go
+++ b/controllers/alerts/reconciler.go
@@ -13,6 +13,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
 
 	"github.com/kubevirt/hyperconverged-cluster-operator/controllers/common"
 	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
@@ -36,14 +37,21 @@ type MonitoringReconciler struct {
 	eventEmitter hcoutil.EventEmitter
 }
 
+var logger = logf.Log.WithName("hyperconverged-operator-monitoring-reconciler")
+
 func NewMonitoringReconciler(ci hcoutil.ClusterInfo, cl client.Client, ee hcoutil.EventEmitter, scheme *runtime.Scheme) *MonitoringReconciler {
 	deployment := ci.GetDeployment()
 	namespace := deployment.Namespace
 	owner := getDeploymentReference(deployment)
 
+	alertRuleReconciler, err := newAlertRuleReconciler(namespace, owner)
+	if err != nil {
+		logger.Error(err, "failed to create the 'PrometheusRule' reconciler")
+	}
+
 	return &MonitoringReconciler{
 		reconcilers: []MetricReconciler{
-			newAlertRuleReconciler(namespace, owner),
+			alertRuleReconciler,
 			newRoleReconciler(namespace, owner),
 			newRoleBindingReconciler(namespace, owner, ci),
 			newMetricServiceReconciler(namespace, owner),
diff --git a/controllers/hyperconverged/hyperconverged_controller_test.go b/controllers/hyperconverged/hyperconverged_controller_test.go
index 1f2b7a4c52..dc0172ecea 100644
--- a/controllers/hyperconverged/hyperconverged_controller_test.go
+++ b/controllers/hyperconverged/hyperconverged_controller_test.go
@@ -13,6 +13,7 @@ import (
 	openshiftconfigv1 "github.com/openshift/api/config/v1"
 	consolev1 "github.com/openshift/api/console/v1"
 	conditionsv1 "github.com/openshift/custom-resource-status/conditions/v1"
+	objectreferencesv1 "github.com/openshift/custom-resource-status/objectreferences/v1"
 	v1 "github.com/openshift/custom-resource-status/objectreferences/v1"
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
 	corev1 "k8s.io/api/core/v1"
@@ -31,6 +32,10 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
+	kubevirtcorev1 "kubevirt.io/api/core/v1"
+	cdiv1beta1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
+	sspv1beta2 "kubevirt.io/ssp-operator/api/v1beta2"
+
 	hcov1beta1 "github.com/kubevirt/hyperconverged-cluster-operator/api/v1beta1"
 	"github.com/kubevirt/hyperconverged-cluster-operator/controllers/alerts"
 	"github.com/kubevirt/hyperconverged-cluster-operator/controllers/common"
@@ -39,11 +44,6 @@ import (
 	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
 	hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
 	"github.com/kubevirt/hyperconverged-cluster-operator/version"
-	kubevirtcorev1 "kubevirt.io/api/core/v1"
-	cdiv1beta1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
-	sspv1beta2 "kubevirt.io/ssp-operator/api/v1beta2"
-
-	objectreferencesv1 "github.com/openshift/custom-resource-status/objectreferences/v1"
 )
 
 // name and namespace of our primary resource
diff --git a/controllers/hyperconverged/testUtils_test.go b/controllers/hyperconverged/testUtils_test.go
index 794eb6499f..625d32cbc3 100644
--- a/controllers/hyperconverged/testUtils_test.go
+++ b/controllers/hyperconverged/testUtils_test.go
@@ -25,6 +25,10 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
+	kubevirtcorev1 "kubevirt.io/api/core/v1"
+	cdiv1beta1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
+	sspv1beta2 "kubevirt.io/ssp-operator/api/v1beta2"
+
 	networkaddonsv1 "github.com/kubevirt/cluster-network-addons-operator/pkg/apis/networkaddonsoperator/v1"
 	hcov1beta1 "github.com/kubevirt/hyperconverged-cluster-operator/api/v1beta1"
 	"github.com/kubevirt/hyperconverged-cluster-operator/controllers/alerts"
@@ -34,9 +38,6 @@ import (
 	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/components"
 	hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
 	"github.com/kubevirt/hyperconverged-cluster-operator/version"
-	kubevirtcorev1 "kubevirt.io/api/core/v1"
-	cdiv1beta1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
-	sspv1beta2 "kubevirt.io/ssp-operator/api/v1beta2"
 )
 
 // Mock TestRequest to simulate Reconcile() being called on an event for a watched resource
diff --git a/docs/metrics.md b/docs/metrics.md
index 6c1b3d2050..200c7eaa35 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -1,5 +1,8 @@
 # Hyperconverged Cluster Operator metrics
 
+### cluster:vmi_request_cpu_cores:sum
+Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster. Type: Gauge.
+
 ### kubevirt_hco_hyperconverged_cr_exists
 Indicates whether the HyperConverged custom resource exists (1) or not (0). Type: Gauge.
diff --git a/hack/prom-rule-ci/rule-spec-dumper.go b/hack/prom-rule-ci/rule-spec-dumper.go
index b4f4e0fa88..3abf56fa62 100644
--- a/hack/prom-rule-ci/rule-spec-dumper.go
+++ b/hack/prom-rule-ci/rule-spec-dumper.go
@@ -2,11 +2,12 @@ package main
 
 import (
 	"encoding/json"
-
-	"github.com/kubevirt/hyperconverged-cluster-operator/controllers/alerts"
-
 	"fmt"
 	"os"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
 )
 
 func verifyArgs() error {
@@ -25,8 +26,24 @@ func main() {
 
 	targetFile := os.Args[1]
 
-	promRuleSpec := alerts.NewPrometheusRuleSpec()
-	b, err := json.Marshal(promRuleSpec)
+	err := rules.SetupRules()
+	if err != nil {
+		panic(err)
+	}
+
+	promRule, err := rules.BuildPrometheusRule(
+		"kubevirt-hyperconverged",
+		metav1.OwnerReference{
+			APIVersion: "v1",
+			Kind:       "Namespace",
+			Name:       "kubevirt-hyperconverged",
+		},
+	)
+	if err != nil {
+		panic(err)
+	}
+
+	b, err := json.Marshal(promRule.Spec)
 	if err != nil {
 		panic(err)
 	}
diff --git a/pkg/monitoring/rules/alerts/operator_alerts.go b/pkg/monitoring/rules/alerts/operator_alerts.go
new file mode 100644
index 0000000000..afc71003d4
--- /dev/null
+++ b/pkg/monitoring/rules/alerts/operator_alerts.go
@@ -0,0 +1,70 @@
+package alerts
+
+import (
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/utils/ptr"
+)
+
+const (
+	outOfBandUpdateAlert          = "KubeVirtCRModified"
+	unsafeModificationAlert       = "UnsupportedHCOModification"
+	installationNotCompletedAlert = "HCOInstallationIncomplete"
+	singleStackIPv6Alert          = "SingleStackIPv6Unsupported"
+	severityAlertLabelKey         = "severity"
+	healthImpactAlertLabelKey     = "operator_health_impact"
+)
+
+func operatorAlerts() []promv1.Rule {
+	return []promv1.Rule{
+		{
+			Alert: outOfBandUpdateAlert,
+			Expr:  intstr.FromString("sum by(component_name) ((round(increase(kubevirt_hco_out_of_band_modifications_total[10m]))>0 and kubevirt_hco_out_of_band_modifications_total offset 10m) or (kubevirt_hco_out_of_band_modifications_total != 0 unless kubevirt_hco_out_of_band_modifications_total offset 10m))"),
+			Annotations: map[string]string{
+				"description": "Out-of-band modification for {{ $labels.component_name }}.",
+				"summary":     "{{ $value }} out-of-band CR modifications were detected in the last 10 minutes.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "warning",
+				healthImpactAlertLabelKey: "warning",
+			},
+		},
+		{
+			Alert: unsafeModificationAlert,
+			Expr:  intstr.FromString("sum by(annotation_name) ((kubevirt_hco_unsafe_modifications)>0)"),
+			Annotations: map[string]string{
+				"description": "unsafe modification for the {{ $labels.annotation_name }} annotation in the HyperConverged resource.",
+				"summary":     "{{ $value }} unsafe modifications were detected in the HyperConverged resource.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "info",
+				healthImpactAlertLabelKey: "none",
+			},
+		},
+		{
+			Alert: installationNotCompletedAlert,
+			Expr:  intstr.FromString("kubevirt_hco_hyperconverged_cr_exists == 0"),
+			For:   ptr.To(promv1.Duration("1h")),
+			Annotations: map[string]string{
+				"description": "the installation was not completed; the HyperConverged custom resource is missing. In order to complete the installation of the Hyperconverged Cluster Operator you should create the HyperConverged custom resource.",
+				"summary":     "the installation was not completed; to complete the installation, create a HyperConverged custom resource.",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "info",
+				healthImpactAlertLabelKey: "critical",
+			},
+		},
+		{
+			Alert: singleStackIPv6Alert,
+			Expr:  intstr.FromString("kubevirt_hco_single_stack_ipv6 == 1"),
+			Annotations: map[string]string{
+				"description": "KubeVirt Hyperconverged is not supported on a single stack IPv6 cluster",
+				"summary":     "KubeVirt Hyperconverged is not supported on a single stack IPv6 cluster",
+			},
+			Labels: map[string]string{
+				severityAlertLabelKey:     "critical",
+				healthImpactAlertLabelKey: "critical",
+			},
+		},
+	}
+}
diff --git a/pkg/monitoring/rules/alerts/prometheus.go b/pkg/monitoring/rules/alerts/prometheus.go
new file mode 100644
index 0000000000..a1d573e6de
--- /dev/null
+++ b/pkg/monitoring/rules/alerts/prometheus.go
@@ -0,0 +1,52 @@
+package alerts
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/machadovilaca/operator-observability/pkg/operatorrules"
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+)
+
+const (
+	prometheusRunbookAnnotationKey = "runbook_url"
+	partOfAlertLabelKey            = "kubernetes_operator_part_of"
+	partOfAlertLabelValue          = "kubevirt"
+	componentAlertLabelKey         = "kubernetes_operator_component"
+	componentAlertLabelValue       = "hyperconverged-cluster-operator"
+	defaultRunbookURLTemplate      = "https://kubevirt.io/monitoring/runbooks/%s"
+	runbookURLTemplateEnv          = "RUNBOOK_URL_TEMPLATE"
+)
+
+func Register() error {
+	alerts := [][]promv1.Rule{
+		operatorAlerts(),
+	}
+
+	runbookURLTemplate := getRunbookURLTemplate()
+	for _, alertGroup := range alerts {
+		for _, alert := range alertGroup {
+			alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
+			alert.Labels[componentAlertLabelKey] = componentAlertLabelValue
+			alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert)
+		}
+
+	}
+
+	return operatorrules.RegisterAlerts(alerts...)
+}
+
+func getRunbookURLTemplate() string {
+	runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
+	if !exists {
+		runbookURLTemplate = defaultRunbookURLTemplate
+	}
+
+	if strings.Count(runbookURLTemplate, "%s") != 1 {
+		panic(errors.New("runbook URL template must have exactly 1 %s substring"))
+	}
+
+	return runbookURLTemplate
+}
diff --git a/pkg/monitoring/rules/recordingrules/operator.go b/pkg/monitoring/rules/recordingrules/operator.go
new file mode 100644
index 0000000000..871b2185a8
--- /dev/null
+++ b/pkg/monitoring/rules/recordingrules/operator.go
@@ -0,0 +1,26 @@
+package recordingrules
+
+import (
+	"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
+	"github.com/machadovilaca/operator-observability/pkg/operatorrules"
+	"k8s.io/apimachinery/pkg/util/intstr"
+)
+
+var operatorRecordingRules = []operatorrules.RecordingRule{
+	{
+		MetricsOpts: operatormetrics.MetricOpts{
+			Name: "kubevirt_hyperconverged_operator_health_status",
+			Help: "Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric",
+		},
+		MetricType: operatormetrics.GaugeType,
+		Expr:       intstr.FromString(`label_replace(vector(2) and on() ((kubevirt_hco_system_health_status>1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="critical"})>0)) or (vector(1) and on() ((kubevirt_hco_system_health_status==1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"})>0))) or vector(0),"name","kubevirt-hyperconverged","","")`),
+	},
+	{
+		MetricsOpts: operatormetrics.MetricOpts{
+			Name: "cluster:vmi_request_cpu_cores:sum",
+			Help: "Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster",
+		},
+		MetricType: operatormetrics.GaugeType,
+		Expr:       intstr.FromString(`sum(kube_pod_container_resource_requests{resource="cpu"} and on (pod) kube_pod_status_phase{phase="Running"} * on (pod) group_left kube_pod_labels{ label_kubevirt_io="virt-launcher"} > 0)`),
+	},
+}
diff --git a/pkg/monitoring/rules/recordingrules/recordingrules.go b/pkg/monitoring/rules/recordingrules/recordingrules.go
new file mode 100644
index 0000000000..3b95c3ae6f
--- /dev/null
+++ b/pkg/monitoring/rules/recordingrules/recordingrules.go
@@ -0,0 +1,9 @@
+package recordingrules
+
+import "github.com/machadovilaca/operator-observability/pkg/operatorrules"
+
+func Register() error {
+	return operatorrules.RegisterRecordingRules(
+		operatorRecordingRules,
+	)
+}
diff --git a/pkg/monitoring/rules/rules.go b/pkg/monitoring/rules/rules.go
new file mode 100644
index 0000000000..df2b7846ba
--- /dev/null
+++ b/pkg/monitoring/rules/rules.go
@@ -0,0 +1,52 @@
+package rules
+
+import (
+	"github.com/machadovilaca/operator-observability/pkg/operatorrules"
+	promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules/alerts"
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules/recordingrules"
+	hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
+)
+
+const (
+	ruleName = hcoutil.HyperConvergedName + "-prometheus-rule"
+)
+
+func SetupRules() error {
+	err := recordingrules.Register()
+	if err != nil {
+		return err
+	}
+
+	err = alerts.Register()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func BuildPrometheusRule(namespace string, owner metav1.OwnerReference) (*promv1.PrometheusRule, error) {
+	rules, err := operatorrules.BuildPrometheusRule(
+		ruleName,
+		namespace,
+		hcoutil.GetLabels(hcoutil.HyperConvergedName, hcoutil.AppComponentMonitoring),
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	rules.OwnerReferences = []metav1.OwnerReference{owner}
+
+	return rules, nil
+}
+
+func ListRecordingRules() []operatorrules.RecordingRule {
+	return operatorrules.ListRecordingRules()
+}
+
+func ListAlerts() []promv1.Rule {
+	return operatorrules.ListAlerts()
+}
diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go
index 94394ee9d8..f8d8f987ac 100644
--- a/tools/metricsdocs/metricsdocs.go
+++ b/tools/metricsdocs/metricsdocs.go
@@ -6,6 +6,7 @@ import (
 	"github.com/machadovilaca/operator-observability/pkg/docs"
 
 	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
 )
 
 const tpl = `# Hyperconverged Cluster Operator metrics
@@ -27,9 +28,6 @@ const tpl = `# Hyperconverged Cluster Operator metrics
 
 {{- end }}
 
-### kubevirt_hyperconverged_operator_health_status
-Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric. Type: Gauge.
-
 ## Developing new metrics
 
 All metrics documented here are auto-generated and reflect exactly what is being
@@ -43,8 +41,14 @@ func main() {
 		panic(err)
 	}
 
+	err = rules.SetupRules()
+	if err != nil {
+		panic(err)
+	}
+
 	metricsList := metrics.ListMetrics()
+	rulesList := rules.ListRecordingRules()
 
-	docsString := docs.BuildMetricsDocsWithCustomTemplate(metricsList, nil, tpl)
+	docsString := docs.BuildMetricsDocsWithCustomTemplate(metricsList, rulesList, tpl)
 	fmt.Print(docsString)
 }
diff --git a/tools/prom-metrics-collector/metrics_json_generator.go b/tools/prom-metrics-collector/metrics_json_generator.go
index 24dbf86865..2ad09f4499 100644
--- a/tools/prom-metrics-collector/metrics_json_generator.go
+++ b/tools/prom-metrics-collector/metrics_json_generator.go
@@ -23,8 +23,10 @@ import (
 	"fmt"
 	"strings"
 
-	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
 	"github.com/kubevirt/monitoring/pkg/metrics/parser"
+
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/metrics"
+	"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
 )
 
 // This should be used only for very rare cases where the naming conventions that are explained in the best practices:
@@ -32,6 +34,7 @@ import (
 // should be ignored.
 var excludedMetrics = map[string]struct{}{
 	"kubevirt_hyperconverged_operator_health_status": struct{}{},
+	"cluster:vmi_request_cpu_cores:sum":              struct{}{},
 }
 
 func main() {
@@ -42,6 +45,13 @@ func main() {
 
 	metricsList := metrics.ListMetrics()
 
+	err = rules.SetupRules()
+	if err != nil {
+		panic(err)
+	}
+
+	rulesList := rules.ListRecordingRules()
+
 	var metricFamilies []parser.Metric
 	for _, m := range metricsList {
 		if _, isExcludedMetric := excludedMetrics[m.GetOpts().Name]; !isExcludedMetric {
@@ -53,6 +63,16 @@ func main() {
 		}
 	}
 
+	for _, r := range rulesList {
+		if _, isExcludedMetric := excludedMetrics[r.GetOpts().Name]; !isExcludedMetric {
+			metricFamilies = append(metricFamilies, parser.Metric{
+				Name: r.GetOpts().Name,
+				Help: r.GetOpts().Help,
+				Type: strings.ToUpper(string(r.GetType())),
+			})
+		}
+	}
+
 	jsonBytes, err := json.Marshal(metricFamilies)
 	if err != nil {
 		panic(err)
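One behavioral note on the new package: rules.SetupRules() registers into a shared, package-level operator-observability registry, so repeated calls accumulate state. That is why the tests above call operatorrules.CleanRegistry() before re-registering with a different RUNBOOK_URL_TEMPLATE. A condensed sketch of that reset-and-rebuild pattern as a plain Go test (the test name and assertions are illustrative, not part of the patch):

	package rules_test

	import (
		"testing"

		"github.com/machadovilaca/operator-observability/pkg/operatorrules"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

		"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
	)

	func TestRunbookURLTemplateOverride(t *testing.T) {
		t.Setenv("RUNBOOK_URL_TEMPLATE", "desired/runbookURL/template/%s")

		// Drop rules registered by earlier tests so SetupRules starts from a clean registry.
		operatorrules.CleanRegistry()

		if err := rules.SetupRules(); err != nil {
			t.Fatal(err)
		}

		promRule, err := rules.BuildPrometheusRule("kubevirt-hyperconverged", metav1.OwnerReference{})
		if err != nil {
			t.Fatal(err)
		}

		// Every alert should carry a runbook_url annotation built from the overridden template.
		for _, group := range promRule.Spec.Groups {
			for _, rule := range group.Rules {
				if rule.Alert != "" && rule.Annotations["runbook_url"] == "" {
					t.Errorf("alert %s is missing a runbook_url annotation", rule.Alert)
				}
			}
		}
	}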