Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor recording-rules and alerts code #2706

Merged
merged 1 commit into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 20 additions & 175 deletions controllers/alerts/alerts.go
Original file line number Diff line number Diff line change
@@ -1,81 +1,47 @@
package alerts

// This package makes sure that the PrometheusRule is present with the right configurations.
// This code was taken out of the operator package, because the operand reconciliation is done
// only if the HyperConverged CR is present. But we need the alert in place even if the CR was
// not created.

import (
"context"
"errors"
"fmt"
"os"
"reflect"
"strings"

"github.com/go-logr/logr"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/monitoring/rules"
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

const (
alertRuleGroup = "kubevirt.hyperconverged.rules"
outOfBandUpdateAlert = "KubeVirtCRModified"
unsafeModificationAlert = "UnsupportedHCOModification"
installationNotCompletedAlert = "HCOInstallationIncomplete"
singleStackIPv6Alert = "SingleStackIPv6Unsupported"
severityAlertLabelKey = "severity"
healthImpactAlertLabelKey = "operator_health_impact"
partOfAlertLabelKey = "kubernetes_operator_part_of"
partOfAlertLabelValue = "kubevirt"
componentAlertLabelKey = "kubernetes_operator_component"
componentAlertLabelValue = "hyperconverged-cluster-operator"
ruleName = hcoutil.HyperConvergedName + "-prometheus-rule"
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
ruleName = hcoutil.HyperConvergedName + "-prometheus-rule"
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
avlitman marked this conversation as resolved.
Show resolved Hide resolved
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
)

type runbookCreator struct {
template string
type AlertRuleReconciler struct {
theRule *promv1.PrometheusRule
}

func newRunbookCreator() *runbookCreator {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}

if strings.Count(runbookURLTemplate, "%s") != 1 {
panic(errors.New("runbooks URL template must have exactly 1 %s substring"))
// newAlertRuleReconciler creates new AlertRuleReconciler instance and returns a pointer to it.
func newAlertRuleReconciler(namespace string, owner metav1.OwnerReference) (*AlertRuleReconciler, error) {
err := rules.SetupRules()
if err != nil {
return nil, err
}

return &runbookCreator{
template: runbookURLTemplate,
rule, err := rules.BuildPrometheusRule(namespace, owner)
if err != nil {
return nil, err
}
}

// getURL returns the runbook URL for the given alert by substituting
// alertName into the creator's URL template.
func (r runbookCreator) getURL(alertName string) string {
	runbookURL := fmt.Sprintf(r.template, alertName)
	return runbookURL
}

// AlertRuleReconciler holds the PrometheusRule that its methods use as the
// reference object: UpdateExistingResource compares an existing rule's spec
// against it and copies the reference spec over when they differ.
type AlertRuleReconciler struct {
// theRule is the reference PrometheusRule, set once by the constructor.
theRule *monitoringv1.PrometheusRule
}

// newAlertRuleReconciler creates new AlertRuleReconciler instance and returns a pointer to it.
func newAlertRuleReconciler(namespace string, owner metav1.OwnerReference) *AlertRuleReconciler {
return &AlertRuleReconciler{
theRule: newPrometheusRule(namespace, owner),
}
theRule: rule,
}, nil
}

// Kind returns the PrometheusRule kind name.
//
// NOTE(review): two return statements are present; the second one is
// unreachable. It looks like the old and new lines of a diff (monitoringv1 vs.
// promv1 alias of the same package) ended up side by side — confirm which one
// should remain.
func (r *AlertRuleReconciler) Kind() string {
return monitoringv1.PrometheusRuleKind
return promv1.PrometheusRuleKind
}

func (r *AlertRuleReconciler) ResourceName() string {
Expand All @@ -87,12 +53,12 @@ func (r *AlertRuleReconciler) GetFullResource() client.Object {
}

// EmptyObject returns a pointer to a new, zero-valued PrometheusRule.
//
// NOTE(review): two return statements are present; the second one is
// unreachable. It looks like the old and new lines of a diff (monitoringv1 vs.
// promv1 alias of the same package) ended up side by side — confirm which one
// should remain.
func (r *AlertRuleReconciler) EmptyObject() client.Object {
return &monitoringv1.PrometheusRule{}
return &promv1.PrometheusRule{}
}

func (r *AlertRuleReconciler) UpdateExistingResource(ctx context.Context, cl client.Client, existing client.Object, logger logr.Logger) (client.Object, bool, error) {
needUpdate := false
rule := existing.(*monitoringv1.PrometheusRule)
rule := existing.(*promv1.PrometheusRule)
if !reflect.DeepEqual(r.theRule.Spec, rule.Spec) {
needUpdate = true
r.theRule.Spec.DeepCopyInto(&rule.Spec)
Expand All @@ -112,124 +78,3 @@ func (r *AlertRuleReconciler) UpdateExistingResource(ctx context.Context, cl cli

return rule, needUpdate, nil
}

// newPrometheusRule builds the PrometheusRule object that carries the
// operator's alert and recording rules.
//
// The object is named ruleName, is placed in the given namespace, is labeled
// with the result of hcoutil.GetLabels for the monitoring component, and has
// owner as its single owner reference. Its spec is produced by
// NewPrometheusRuleSpec.
func newPrometheusRule(namespace string, owner metav1.OwnerReference) *monitoringv1.PrometheusRule {
	return &monitoringv1.PrometheusRule{
		TypeMeta: metav1.TypeMeta{
			APIVersion: monitoringv1.SchemeGroupVersion.String(),
			// Use the library constant rather than repeating the literal, so
			// this stays in sync with the value reported by Kind().
			Kind: monitoringv1.PrometheusRuleKind,
		},
		ObjectMeta: metav1.ObjectMeta{
			Name:            ruleName,
			Labels:          hcoutil.GetLabels(hcoutil.HyperConvergedName, hcoutil.AppComponentMonitoring),
			Namespace:       namespace,
			OwnerReferences: []metav1.OwnerReference{owner},
		},
		Spec: *NewPrometheusRuleSpec(),
	}
}

// NewPrometheusRuleSpec creates the PrometheusRuleSpec holding the operator's
// alert and recording rules in a single group named alertRuleGroup.
//
// Every alert rule (a rule with a non-empty Alert name) additionally receives
// a "runbook_url" annotation built by the runbook creator, plus the part-of
// and component labels. Recording rules are left untouched.
func NewPrometheusRuleSpec() *monitoringv1.PrometheusRuleSpec {
	runbooks := newRunbookCreator()

	groupRules := []monitoringv1.Rule{
		createOutOfBandUpdateAlertRule(),
		createUnsafeModificationAlertRule(),
		createInstallationNotCompletedAlertRule(),
		createRequestCPUCoresRule(),
		createOperatorHealthStatusRule(),
		createSingleStackIPv6AlertRule(),
	}

	// Iterate by index: a missing map has to be created on the slice element
	// itself, not on a per-iteration copy of the rule.
	for i := range groupRules {
		rule := &groupRules[i]
		if rule.Alert == "" {
			continue
		}

		// Writing to a nil map panics, so tolerate alert builders that did
		// not set annotations or labels.
		if rule.Annotations == nil {
			rule.Annotations = make(map[string]string, 1)
		}
		if rule.Labels == nil {
			rule.Labels = make(map[string]string, 2)
		}

		rule.Annotations["runbook_url"] = runbooks.getURL(rule.Alert)
		rule.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
		rule.Labels[componentAlertLabelKey] = componentAlertLabelValue
	}

	return &monitoringv1.PrometheusRuleSpec{
		Groups: []monitoringv1.RuleGroup{{
			Name:  alertRuleGroup,
			Rules: groupRules,
		}},
	}
}

// createOutOfBandUpdateAlertRule builds the outOfBandUpdateAlert rule.
//
// The expression aggregates by component_name and matches when the
// out-of-band modifications counter increased during the last 10 minutes, or
// when the counter is non-zero and had no sample 10 minutes ago. Both the
// severity and the health-impact labels are "warning".
func createOutOfBandUpdateAlertRule() monitoringv1.Rule {
	const query = "sum by(component_name) ((round(increase(kubevirt_hco_out_of_band_modifications_total[10m]))>0 and kubevirt_hco_out_of_band_modifications_total offset 10m) or (kubevirt_hco_out_of_band_modifications_total != 0 unless kubevirt_hco_out_of_band_modifications_total offset 10m))"

	rule := monitoringv1.Rule{Alert: outOfBandUpdateAlert}
	rule.Expr = intstr.FromString(query)
	rule.Annotations = map[string]string{
		"description": "Out-of-band modification for {{ $labels.component_name }}.",
		"summary":     "{{ $value }} out-of-band CR modifications were detected in the last 10 minutes.",
	}
	rule.Labels = map[string]string{
		severityAlertLabelKey:     "warning",
		healthImpactAlertLabelKey: "warning",
	}

	return rule
}

// createUnsafeModificationAlertRule builds the unsafeModificationAlert rule.
//
// The expression aggregates kubevirt_hco_unsafe_modifications by
// annotation_name and matches positive values. Severity is "info" and the
// health impact is "none".
func createUnsafeModificationAlertRule() monitoringv1.Rule {
	const query = "sum by(annotation_name) ((kubevirt_hco_unsafe_modifications)>0)"

	rule := monitoringv1.Rule{Alert: unsafeModificationAlert}
	rule.Expr = intstr.FromString(query)
	rule.Annotations = map[string]string{
		"description": "unsafe modification for the {{ $labels.annotation_name }} annotation in the HyperConverged resource.",
		"summary":     "{{ $value }} unsafe modifications were detected in the HyperConverged resource.",
	}
	rule.Labels = map[string]string{
		severityAlertLabelKey:     "info",
		healthImpactAlertLabelKey: "none",
	}

	return rule
}

// createInstallationNotCompletedAlertRule builds the
// installationNotCompletedAlert rule.
//
// The expression matches while kubevirt_hco_hyperconverged_cr_exists equals 0,
// and the rule's For duration is "1h". Severity is "info" and the health
// impact is "critical".
func createInstallationNotCompletedAlertRule() monitoringv1.Rule {
	const query = "kubevirt_hco_hyperconverged_cr_exists == 0"

	rule := monitoringv1.Rule{Alert: installationNotCompletedAlert}
	rule.Expr = intstr.FromString(query)
	rule.For = ptr.To(monitoringv1.Duration("1h"))
	rule.Annotations = map[string]string{
		"description": "the installation was not completed; the HyperConverged custom resource is missing. In order to complete the installation of the Hyperconverged Cluster Operator you should create the HyperConverged custom resource.",
		"summary":     "the installation was not completed; to complete the installation, create a HyperConverged custom resource.",
	}
	rule.Labels = map[string]string{
		severityAlertLabelKey:     "info",
		healthImpactAlertLabelKey: "critical",
	}

	return rule
}

// Recording rules for openshift/cluster-monitoring-operator

// createRequestCPUCoresRule builds the "cluster:vmi_request_cpu_cores:sum"
// recording rule. Its expression sums the CPU resource requests of containers
// whose pod is in the Running phase and carries the virt-launcher label.
func createRequestCPUCoresRule() monitoringv1.Rule {
	const query = `sum(kube_pod_container_resource_requests{resource="cpu"} and on (pod) kube_pod_status_phase{phase="Running"} * on (pod) group_left kube_pod_labels{ label_kubevirt_io="virt-launcher"} > 0)`

	rule := monitoringv1.Rule{Record: "cluster:vmi_request_cpu_cores:sum"}
	rule.Expr = intstr.FromString(query)

	return rule
}

// createOperatorHealthStatusRule builds the
// "kubevirt_hyperconverged_operator_health_status" recording rule.
//
// The expression evaluates to 2 when kubevirt_hco_system_health_status is
// above 1 or a kubevirt alert with critical health impact is firing, to 1 when
// the status equals 1 or an alert with warning health impact is firing, and to
// 0 otherwise. The result carries the label name="kubevirt-hyperconverged".
func createOperatorHealthStatusRule() monitoringv1.Rule {
	const query = `label_replace(vector(2) and on() ((kubevirt_hco_system_health_status>1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="critical"})>0)) or (vector(1) and on() ((kubevirt_hco_system_health_status==1) or (count(ALERTS{kubernetes_operator_part_of="kubevirt", alertstate="firing", operator_health_impact="warning"})>0))) or vector(0),"name","kubevirt-hyperconverged","","")`

	rule := monitoringv1.Rule{Record: "kubevirt_hyperconverged_operator_health_status"}
	rule.Expr = intstr.FromString(query)

	return rule
}

// createSingleStackIPv6AlertRule builds the singleStackIPv6Alert rule.
//
// The expression matches while kubevirt_hco_single_stack_ipv6 equals 1. Both
// the severity and the health-impact labels are "critical".
func createSingleStackIPv6AlertRule() monitoringv1.Rule {
	const (
		query   = "kubevirt_hco_single_stack_ipv6 == 1"
		message = "KubeVirt Hyperconverged is not supported on a single stack IPv6 cluster"
	)

	rule := monitoringv1.Rule{Alert: singleStackIPv6Alert}
	rule.Expr = intstr.FromString(query)
	rule.Annotations = map[string]string{
		"description": message,
		"summary":     message,
	}
	rule.Labels = map[string]string{
		severityAlertLabelKey:     "critical",
		healthImpactAlertLabelKey: "critical",
	}

	return rule
}
Loading
Loading