Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parametrize promrule name #224

Merged
merged 9 commits into from
Sep 30, 2024
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ The changes should be grouped using the following categories (in order of preced

## [Unreleased](https://github.com/sapcc/absent-metrics-operator/compare/v0.9.4...HEAD)

### Added

- New `prom-rule-name` flag which can be used to provide a template for generating AbsencePrometheusRule names and, consequently, for controlling how absence alert rules are aggregated.
- Improved tests by adding dedicated unit tests for alert rule parsing and name generation edge cases.

### Fixed

- Clean up of absence alert rules when a rule group is deleted.

### Removed

- Heuristic determination of `tier`, `service`, and `support_group` labels. These labels will now be copied over as is from the original alert rule to its corresponding absence alert rule.
Expand Down Expand Up @@ -67,7 +76,7 @@ The changes should be grouped using the following categories (in order of preced

### Fixed

- `-debug` flag.
- `debug` flag.

## 0.9.0 - 2022-11-02

Expand Down
136 changes: 90 additions & 46 deletions controllers/absence_prometheusrule.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,14 @@
package controllers

import (
"bytes"
"context"
"errors"
"fmt"
"reflect"
"sort"
"strings"
"text/template"
"time"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
Expand All @@ -30,37 +32,76 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

const absencePromRuleNameSuffix = "-absent-metric-alert-rules"
const (
// absencePromRuleNameSuffix is the fixed suffix that is appended to every generated
// AbsencePrometheusRule name.
absencePromRuleNameSuffix = "-absent-metric-alert-rules"
// DefaultAbsencePromRuleNameTemplate is a name template that renders the value of the
// source PrometheusRule's "thanos-ruler" label if that label has a non-empty value,
// and the value of its "prometheus" label otherwise.
DefaultAbsencePromRuleNameTemplate = `{{ if index .metadata.labels "thanos-ruler" }}{{ index .metadata.labels "thanos-ruler" }}{{ else }}{{ index .metadata.labels "prometheus" }}{{ end }}`
)

// AbsencePromRuleNameGenerator is a function type that takes a PrometheusRule and
// returns the name of its corresponding AbsencePrometheusRule, i.e. the PrometheusRule
// resource that holds the absence alert rules generated for it. A non-nil error means
// that no name could be generated.
type AbsencePromRuleNameGenerator func(*monitoringv1.PrometheusRule) (string, error)

// CreateAbsencePromRuleNameGenerator creates an absencePromRuleNameGenerator function
// based on a template string.
func CreateAbsencePromRuleNameGenerator(tmplStr string) (AbsencePromRuleNameGenerator, error) {
t, err := template.New("promRuleNameGenerator").Option("missingkey=error").Parse(tmplStr)
if err != nil {
return nil, err
}

return func(pr *monitoringv1.PrometheusRule) (string, error) {
// only a specific vetted subset of attributes is passed into the name template to avoid surprising behavior
data := map[string]any{
"metadata": map[string]any{
"annotations": pr.ObjectMeta.Annotations,
"labels": pr.ObjectMeta.Labels,
"namespace": pr.ObjectMeta.Namespace,
"name": pr.ObjectMeta.Name,
},
}

var buf bytes.Buffer
err = t.Execute(&buf, data)
if err != nil {
return "", fmt.Errorf("could not generate AbsencePrometheusRule name: %w", err)
}

// AbsencePrometheusRuleName returns the name of an AbsencePrometheusRule resource that
// holds the absence alert rules concerning a specific Prometheus server (e.g. openstack, kubernetes, etc.).
func AbsencePrometheusRuleName(promServer string) string {
return fmt.Sprintf("%s%s", promServer, absencePromRuleNameSuffix)
return buf.String() + absencePromRuleNameSuffix, nil
}, nil
}

func (r *PrometheusRuleReconciler) newAbsencePrometheusRule(namespace, promServer string) *monitoringv1.PrometheusRule {
// newAbsencePrometheusRule builds a PrometheusRule object with the given name and
// namespace that is labelled as managed by this operator. The object is only
// constructed here; this function makes no API calls.
//
// labels are the labels of the source PrometheusRule; only the Prometheus server and
// Thanos ruler labels are copied from it, and only if they are present.
func (r *PrometheusRuleReconciler) newAbsencePrometheusRule(name, namespace string, labels map[string]string) *monitoringv1.PrometheusRule {
l := map[string]string{
// Add a label that identifies that this PrometheusRule resource is
// created and managed by this operator.
labelOperatorManagedBy: "true",
"type": "alerting-rules",
}
// Carry over the Prometheus server and Thanos ruler labels from the source
// PrometheusRule object, but only those that are actually set on it.
if v, ok := labels[labelPrometheusServer]; ok {
l[labelPrometheusServer] = v
}
if v, ok := labels[labelThanosRuler]; ok {
l[labelThanosRuler] = v
}

return &monitoringv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Name: AbsencePrometheusRuleName(promServer),
Name: name,
Namespace: namespace,
Labels: map[string]string{
// Add a label that identifies that this PrometheusRule resource is
// created and managed by this operator.
labelOperatorManagedBy: "true",
labelPrometheusServer: promServer,
"type": "alerting-rules",
},
Labels: l,
},
}
}

func (r *PrometheusRuleReconciler) getExistingAbsencePrometheusRule(
ctx context.Context,
namespace, promServer string,
name, namespace string,
) (*monitoringv1.PrometheusRule, error) {

var absencePromRule monitoringv1.PrometheusRule
nsName := types.NamespacedName{Namespace: namespace, Name: AbsencePrometheusRuleName(promServer)}
nsName := types.NamespacedName{Namespace: namespace, Name: name}
if err := r.Get(ctx, nsName, &absencePromRule); err != nil {
return nil, err
}
Expand Down Expand Up @@ -134,21 +175,21 @@ var errCorrespondingAbsencePromRuleNotExists = errors.New("corresponding Absence
func (r *PrometheusRuleReconciler) cleanUpOrphanedAbsenceAlertRules(
ctx context.Context,
promRule types.NamespacedName,
promServer string,
absencePromRule string,
) error {

// Step 1: find the corresponding AbsencePrometheusRule that needs to be cleaned up.
var aPRToClean *monitoringv1.PrometheusRule
if promServer != "" {
if absencePromRule != "" {
var err error
if aPRToClean, err = r.getExistingAbsencePrometheusRule(ctx, promRule.Namespace, promServer); err != nil {
if aPRToClean, err = r.getExistingAbsencePrometheusRule(ctx, absencePromRule, promRule.Namespace); err != nil {
return err
}
} else {
// Since we don't know the Prometheus server for this PrometheusRule therefore we
// have to list all AbsencePrometheusRules in its namespace and find the specific
// AbsencePrometheusRule that contains the absence alert rules that were generated
// for this PrometheusRule.
// Since we don't know the corresponding AbsencePrometheusRule for this PrometheusRule,
// we have to list all AbsencePrometheusRules in its namespace and find the specific
// AbsencePrometheusRule that contains the absence alert rules that were generated
// for this PrometheusRule.
var listOpts client.ListOptions
client.InNamespace(promRule.Namespace).ApplyToList(&listOpts)
client.HasLabels{labelOperatorManagedBy}.ApplyToList(&listOpts)
Expand Down Expand Up @@ -201,23 +242,30 @@ func (r *PrometheusRuleReconciler) cleanUpOrphanedAbsenceAlertRules(
// has the 'absent-metrics-operator/disable' label. If such rules are found then they are
// deleted.
func (r *PrometheusRuleReconciler) cleanUpAbsencePrometheusRule(ctx context.Context, absencePromRule *monitoringv1.PrometheusRule) error {
// Step 1: get names of all PrometheusRule resources in this namespace for the
// concerning Prometheus server.
// Step 1: get names of all PrometheusRule resources in this namespace.
var listOpts client.ListOptions
client.InNamespace(absencePromRule.GetNamespace()).ApplyToList(&listOpts)
client.MatchingLabels{
labelPrometheusServer: absencePromRule.Labels[labelPrometheusServer],
}.ApplyToList(&listOpts)
var promRules monitoringv1.PrometheusRuleList
if err := r.List(ctx, &promRules, &listOpts); err != nil {
return err
}

// Step 2: collect names of those PrometheusRule resources whose absence alert rules
// would end up in this AbsencePrometheusRule as per the name generation template.
aPRName := absencePromRule.GetName()
prNames := make(map[string]bool)
for _, pr := range promRules.Items {
prNames[pr.GetName()] = true
if _, ok := pr.Labels[labelOperatorManagedBy]; ok {
continue
}
if n, err := r.PrometheusRuleName(pr); err == nil {
if n == aPRName {
prNames[pr.GetName()] = true
}
}
}

// Step 2: iterate through all the AbsencePrometheusRule's RuleGroups and remove those
// Step 3: iterate through all the AbsencePrometheusRule's RuleGroups and remove those
// that don't belong to any PrometheusRule.
newRuleGroups := make([]monitoringv1.RuleGroup, 0, len(absencePromRule.Spec.Groups))
for _, g := range absencePromRule.Spec.Groups {
Expand All @@ -231,7 +279,7 @@ func (r *PrometheusRuleReconciler) cleanUpAbsencePrometheusRule(ctx context.Cont
return nil
}

// Step 3: if, after the cleanup, the AbsencePrometheusRule ends up being empty then
// Step 4: if, after the cleanup, the AbsencePrometheusRule ends up being empty then
// delete it otherwise update.
if len(newRuleGroups) == 0 {
return r.deleteAbsencePrometheusRule(ctx, absencePromRule)
Expand All @@ -248,22 +296,18 @@ func (r *PrometheusRuleReconciler) updateAbsenceAlertRules(ctx context.Context,
namespace := promRule.GetNamespace()
log := r.Log.WithValues("name", promRuleName, "namespace", namespace)

// Step 1: find the Prometheus server for this resource.
promRuleLabels := promRule.GetLabels()
promServer, ok := promRuleLabels["prometheus"]
if !ok {
// Normally this shouldn't happen but just in case that it does.
return errors.New("no 'prometheus' label found")
}

// Step 2: get the corresponding AbsencePrometheusRule if it exists.
// Step 1: get the corresponding AbsencePrometheusRule if it exists.
existingAbsencePrometheusRule := false
absencePromRule, err := r.getExistingAbsencePrometheusRule(ctx, namespace, promServer)
aPRName, err := r.PrometheusRuleName(promRule)
if err != nil {
return err
}
absencePromRule, err := r.getExistingAbsencePrometheusRule(ctx, aPRName, namespace)
switch {
case err == nil:
existingAbsencePrometheusRule = true
case apierrors.IsNotFound(err):
absencePromRule = r.newAbsencePrometheusRule(namespace, promServer)
absencePromRule = r.newAbsencePrometheusRule(aPRName, namespace, promRule.GetLabels())
default:
// This could have been caused by a temporary network failure, or any
// other transient reason.
Expand All @@ -283,25 +327,25 @@ func (r *PrometheusRuleReconciler) updateAbsenceAlertRules(ctx context.Context,
delete(absencePromRule.Labels, LabelTier)
}

// Step 3: parse RuleGroups and generate corresponding absence alert rules.
// Step 2: parse RuleGroups and generate corresponding absence alert rules.
absenceRuleGroups, err := ParseRuleGroups(log, promRule.Spec.Groups, promRuleName, r.KeepLabel)
if err != nil {
return err
}

// Step 4: we clean up orphaned absence alert rules from the AbsencePrometheusRule in
// Step 3: we clean up orphaned absence alert rules from the AbsencePrometheusRule in
// case no absence alert rules were generated.
// This can happen when changes have been made to alert rules that result in no absent
// alerts. E.g. absent() or the 'no_alert_on_absence' label was used.
if len(absenceRuleGroups) == 0 {
if existingAbsencePrometheusRule {
key := types.NamespacedName{Namespace: namespace, Name: promRuleName}
return r.cleanUpOrphanedAbsenceAlertRules(ctx, key, promServer)
return r.cleanUpOrphanedAbsenceAlertRules(ctx, key, aPRName)
}
return nil
}

// Step 5: if it's an existing AbsencePrometheusRule then update otherwise create a new resource.
// Step 4: if it's an existing AbsencePrometheusRule then update otherwise create a new resource.
if existingAbsencePrometheusRule {
existingRuleGroups := unmodifiedAbsencePromRule.Spec.Groups
result := mergeAbsenceRuleGroups(promRuleName, existingRuleGroups, absenceRuleGroups)
Expand Down
79 changes: 79 additions & 0 deletions controllers/absence_prometheusrule_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Copyright 2024 SAP SE
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package controllers

import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Specs for generating AbsencePrometheusRule names from a name template.
var _ = Describe("AbsencePrometheusRule", func() {
	// sourceRule is the PrometheusRule that every table entry renders its template against.
	sourceRule := &monitoringv1.PrometheusRule{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "foobar.alerts",
			Namespace: "outerspace",
			Labels: map[string]string{
				"prometheus":   "openstack",
				"thanos-ruler": "titan",
			},
		},
	}

	// checkName builds a generator from tmplStr (which must always parse) and runs it
	// against sourceRule. Depending on shouldFail it expects either an error from the
	// generator or the expected name.
	checkName := func(tmplStr, expected string, shouldFail bool) {
		generate, err := CreateAbsencePromRuleNameGenerator(tmplStr)
		Expect(err).ToNot(HaveOccurred())

		got, err := generate(sourceRule)
		if !shouldFail {
			Expect(err).ToNot(HaveOccurred())
			Expect(got).To(Equal(expected))
			return
		}
		Expect(err).To(HaveOccurred())
	}

	DescribeTable("Name generation", checkName,
		Entry("name that uses the original name",
			`{{ .metadata.name }}`,
			"foobar.alerts"+absencePromRuleNameSuffix,
			false,
		),
		Entry("name that uses the namespace",
			`{{ .metadata.namespace }}`,
			"outerspace"+absencePromRuleNameSuffix,
			false,
		),
		Entry("name that uses the original name and namespace",
			`{{ .metadata.name }}-{{ .metadata.namespace }}`,
			"foobar.alerts-outerspace"+absencePromRuleNameSuffix,
			false,
		),
		Entry("name with prometheus label",
			`{{ .metadata.labels.prometheus }}`,
			"openstack"+absencePromRuleNameSuffix,
			false,
		),
		Entry("name with thanos-ruler label if it exists",
			DefaultAbsencePromRuleNameTemplate,
			"titan"+absencePromRuleNameSuffix,
			false,
		),
		Entry("name that references nonexistent metadata",
			`{{ .metadata.doesntexist }}`,
			"",
			true,
		),
	)
})
1 change: 1 addition & 0 deletions controllers/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (

labelNoAlertOnAbsence = "no_alert_on_absence"
labelPrometheusServer = "prometheus"
labelThanosRuler = "thanos-ruler"
)

// KeepLabel specifies which labels to keep on an absence alert rule.
Expand Down
6 changes: 5 additions & 1 deletion controllers/prometheusrule_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type PrometheusRuleReconciler struct {
Scheme *runtime.Scheme
Log logr.Logger

PrometheusRuleName AbsencePromRuleNameGenerator
// KeepLabel is a map of labels that will be retained from the original alert rule and
// passed on to its corresponding absence alert rule.
KeepLabel KeepLabel
Expand Down Expand Up @@ -176,7 +177,10 @@ func (r *PrometheusRuleReconciler) reconcileObject(
// elapsed).
if parseBool(l[labelOperatorDisable]) {
log.V(logLevelDebug).Info("operator disabled for this PrometheusRule")
err := r.cleanUpOrphanedAbsenceAlertRules(ctx, key, l[labelPrometheusServer])
aPRName, err := r.PrometheusRuleName(obj)
if err == nil {
err = r.cleanUpOrphanedAbsenceAlertRules(ctx, key, aPRName)
}
if err != nil {
if !apierrors.IsNotFound(err) && !errors.Is(err, errCorrespondingAbsencePromRuleNotExists) {
log.Error(err, "could not clean up orphaned absence alert rules")
Expand Down
Loading
Loading