Skip to content

Commit

Permalink
Merge pull request #224 from sapcc/parametrize-promrule-name
Browse files Browse the repository at this point in the history
Parametrize promrule name
  • Loading branch information
majewsky authored Sep 30, 2024
2 parents 8c9832e + 8031e7a commit ab73636
Show file tree
Hide file tree
Showing 9 changed files with 228 additions and 60 deletions.
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ The changes should be grouped using the following categories (in order of preced

## [Unreleased](https://github.com/sapcc/absent-metrics-operator/compare/v0.9.4...HEAD)

### Added

- New `prom-rule-name` flag which can be used to provide a template for AbsencePrometheusRule name generation and, consequently, for the aggregation of absence alert rules.
- Improved tests by adding dedicated unit tests for alert rule parsing and name generation edge-cases.

### Fixed

- Cleanup of absence alert rules when a rule group is deleted.

### Removed

- Heuristic determination of `tier`, `service`, and `support_group` labels. These labels will now be copied over as is from the original alert rule to its corresponding absence alert rule.
Expand Down Expand Up @@ -67,7 +76,7 @@ The changes should be grouped using the following categories (in order of preced

### Fixed

- `-debug` flag.
- `debug` flag.

## 0.9.0 - 2022-11-02

Expand Down
136 changes: 90 additions & 46 deletions controllers/absence_prometheusrule.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,14 @@
package controllers

import (
"bytes"
"context"
"errors"
"fmt"
"reflect"
"sort"
"strings"
"text/template"
"time"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
Expand All @@ -30,37 +32,76 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

const absencePromRuleNameSuffix = "-absent-metric-alert-rules"
const (
// absencePromRuleNameSuffix is appended to every name produced by a generator that
// was created with CreateAbsencePromRuleNameGenerator.
absencePromRuleNameSuffix = "-absent-metric-alert-rules"
// DefaultAbsencePromRuleNameTemplate is a name template that evaluates to the value
// of the "thanos-ruler" label if that label has a non-empty value, and to the value
// of the "prometheus" label otherwise.
// NOTE(review): presumably the default value of the `prom-rule-name` flag — confirm
// at the flag definition.
DefaultAbsencePromRuleNameTemplate = `{{ if index .metadata.labels "thanos-ruler" }}{{ index .metadata.labels "thanos-ruler" }}{{ else }}{{ index .metadata.labels "prometheus" }}{{ end }}`
)

// AbsencePromRuleNameGenerator is a function type that takes a PrometheusRule and
// generates the name of the corresponding PrometheusRule that holds the generated
// absence alert rules. A non-nil error is returned if no name could be generated
// for the given PrometheusRule.
type AbsencePromRuleNameGenerator func(*monitoringv1.PrometheusRule) (string, error)

// CreateAbsencePromRuleNameGenerator creates an AbsencePromRuleNameGenerator function
// based on a template string.
func CreateAbsencePromRuleNameGenerator(tmplStr string) (AbsencePromRuleNameGenerator, error) {
t, err := template.New("promRuleNameGenerator").Option("missingkey=error").Parse(tmplStr)
if err != nil {
return nil, err
}

return func(pr *monitoringv1.PrometheusRule) (string, error) {
// only a specific vetted subset of attributes is passed into the name template to avoid surprising behavior
data := map[string]any{
"metadata": map[string]any{
"annotations": pr.ObjectMeta.Annotations,
"labels": pr.ObjectMeta.Labels,
"namespace": pr.ObjectMeta.Namespace,
"name": pr.ObjectMeta.Name,
},
}

var buf bytes.Buffer
err = t.Execute(&buf, data)
if err != nil {
return "", fmt.Errorf("could not generate AbsencePrometheusRule name: %w", err)
}

// AbsencePrometheusRuleName returns the name of an AbsencePrometheusRule resource that
// holds the absence alert rules concerning a specific Prometheus server (e.g. openstack, kubernetes, etc.).
func AbsencePrometheusRuleName(promServer string) string {
return fmt.Sprintf("%s%s", promServer, absencePromRuleNameSuffix)
return buf.String() + absencePromRuleNameSuffix, nil
}, nil
}

func (r *PrometheusRuleReconciler) newAbsencePrometheusRule(namespace, promServer string) *monitoringv1.PrometheusRule {
func (r *PrometheusRuleReconciler) newAbsencePrometheusRule(name, namespace string, labels map[string]string) *monitoringv1.PrometheusRule {
l := map[string]string{
// Add a label that identifies that this PrometheusRule resource is
// created and managed by this operator.
labelOperatorManagedBy: "true",
"type": "alerting-rules",
}
// Carry over labels from source PrometheusRule object if needed.
if v, ok := labels[labelPrometheusServer]; ok {
l[labelPrometheusServer] = v
}
if v, ok := labels[labelThanosRuler]; ok {
l[labelThanosRuler] = v
}

return &monitoringv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Name: AbsencePrometheusRuleName(promServer),
Name: name,
Namespace: namespace,
Labels: map[string]string{
// Add a label that identifies that this PrometheusRule resource is
// created and managed by this operator.
labelOperatorManagedBy: "true",
labelPrometheusServer: promServer,
"type": "alerting-rules",
},
Labels: l,
},
}
}

func (r *PrometheusRuleReconciler) getExistingAbsencePrometheusRule(
ctx context.Context,
namespace, promServer string,
name, namespace string,
) (*monitoringv1.PrometheusRule, error) {

var absencePromRule monitoringv1.PrometheusRule
nsName := types.NamespacedName{Namespace: namespace, Name: AbsencePrometheusRuleName(promServer)}
nsName := types.NamespacedName{Namespace: namespace, Name: name}
if err := r.Get(ctx, nsName, &absencePromRule); err != nil {
return nil, err
}
Expand Down Expand Up @@ -134,21 +175,21 @@ var errCorrespondingAbsencePromRuleNotExists = errors.New("corresponding Absence
func (r *PrometheusRuleReconciler) cleanUpOrphanedAbsenceAlertRules(
ctx context.Context,
promRule types.NamespacedName,
promServer string,
absencePromRule string,
) error {

// Step 1: find the corresponding AbsencePrometheusRule that needs to be cleaned up.
var aPRToClean *monitoringv1.PrometheusRule
if promServer != "" {
if absencePromRule != "" {
var err error
if aPRToClean, err = r.getExistingAbsencePrometheusRule(ctx, promRule.Namespace, promServer); err != nil {
if aPRToClean, err = r.getExistingAbsencePrometheusRule(ctx, absencePromRule, promRule.Namespace); err != nil {
return err
}
} else {
// Since we don't know the Prometheus server for this PrometheusRule therefore we
// have to list all AbsencePrometheusRules in its namespace and find the specific
// AbsencePrometheusRule that contains the absence alert rules that were generated
// for this PrometheusRule.
// Since we don't know the corresponding AbsencePrometheusRule for this PrometheusRule,
// we have to list all AbsencePrometheusRules in the PrometheusRule's namespace and
// find the specific AbsencePrometheusRule that contains the absence alert rules that
// were generated for this PrometheusRule.
var listOpts client.ListOptions
client.InNamespace(promRule.Namespace).ApplyToList(&listOpts)
client.HasLabels{labelOperatorManagedBy}.ApplyToList(&listOpts)
Expand Down Expand Up @@ -201,23 +242,30 @@ func (r *PrometheusRuleReconciler) cleanUpOrphanedAbsenceAlertRules(
// has the 'absent-metrics-operator/disable' label. If such rules are found then they are
// deleted.
func (r *PrometheusRuleReconciler) cleanUpAbsencePrometheusRule(ctx context.Context, absencePromRule *monitoringv1.PrometheusRule) error {
// Step 1: get names of all PrometheusRule resources in this namespace for the
// concerning Prometheus server.
// Step 1: get names of all PrometheusRule resources in this namespace.
var listOpts client.ListOptions
client.InNamespace(absencePromRule.GetNamespace()).ApplyToList(&listOpts)
client.MatchingLabels{
labelPrometheusServer: absencePromRule.Labels[labelPrometheusServer],
}.ApplyToList(&listOpts)
var promRules monitoringv1.PrometheusRuleList
if err := r.List(ctx, &promRules, &listOpts); err != nil {
return err
}

// Step 2: collect names of those PrometheusRule resources whose absence alert rules
// would end up in this AbsencePrometheusRule as per the name generation template.
aPRName := absencePromRule.GetName()
prNames := make(map[string]bool)
for _, pr := range promRules.Items {
prNames[pr.GetName()] = true
if _, ok := pr.Labels[labelOperatorManagedBy]; ok {
continue
}
if n, err := r.PrometheusRuleName(pr); err == nil {
if n == aPRName {
prNames[pr.GetName()] = true
}
}
}

// Step 2: iterate through all the AbsencePrometheusRule's RuleGroups and remove those
// Step 3: iterate through all the AbsencePrometheusRule's RuleGroups and remove those
// that don't belong to any PrometheusRule.
newRuleGroups := make([]monitoringv1.RuleGroup, 0, len(absencePromRule.Spec.Groups))
for _, g := range absencePromRule.Spec.Groups {
Expand All @@ -231,7 +279,7 @@ func (r *PrometheusRuleReconciler) cleanUpAbsencePrometheusRule(ctx context.Cont
return nil
}

// Step 3: if, after the cleanup, the AbsencePrometheusRule ends up being empty then
// Step 4: if, after the cleanup, the AbsencePrometheusRule ends up being empty then
// delete it otherwise update.
if len(newRuleGroups) == 0 {
return r.deleteAbsencePrometheusRule(ctx, absencePromRule)
Expand All @@ -248,22 +296,18 @@ func (r *PrometheusRuleReconciler) updateAbsenceAlertRules(ctx context.Context,
namespace := promRule.GetNamespace()
log := r.Log.WithValues("name", promRuleName, "namespace", namespace)

// Step 1: find the Prometheus server for this resource.
promRuleLabels := promRule.GetLabels()
promServer, ok := promRuleLabels["prometheus"]
if !ok {
// Normally this shouldn't happen but just in case that it does.
return errors.New("no 'prometheus' label found")
}

// Step 2: get the corresponding AbsencePrometheusRule if it exists.
// Step 1: get the corresponding AbsencePrometheusRule if it exists.
existingAbsencePrometheusRule := false
absencePromRule, err := r.getExistingAbsencePrometheusRule(ctx, namespace, promServer)
aPRName, err := r.PrometheusRuleName(promRule)
if err != nil {
return err
}
absencePromRule, err := r.getExistingAbsencePrometheusRule(ctx, aPRName, namespace)
switch {
case err == nil:
existingAbsencePrometheusRule = true
case apierrors.IsNotFound(err):
absencePromRule = r.newAbsencePrometheusRule(namespace, promServer)
absencePromRule = r.newAbsencePrometheusRule(aPRName, namespace, promRule.GetLabels())
default:
// This could have been caused by a temporary network failure, or any
// other transient reason.
Expand All @@ -283,25 +327,25 @@ func (r *PrometheusRuleReconciler) updateAbsenceAlertRules(ctx context.Context,
delete(absencePromRule.Labels, LabelTier)
}

// Step 3: parse RuleGroups and generate corresponding absence alert rules.
// Step 2: parse RuleGroups and generate corresponding absence alert rules.
absenceRuleGroups, err := ParseRuleGroups(log, promRule.Spec.Groups, promRuleName, r.KeepLabel)
if err != nil {
return err
}

// Step 4: we clean up orphaned absence alert rules from the AbsencePrometheusRule in
// Step 3: we clean up orphaned absence alert rules from the AbsencePrometheusRule in
// case no absence alert rules were generated.
// This can happen when changes have been made to alert rules that result in no absent
// alerts. E.g. absent() or the 'no_alert_on_absence' label was used.
if len(absenceRuleGroups) == 0 {
if existingAbsencePrometheusRule {
key := types.NamespacedName{Namespace: namespace, Name: promRuleName}
return r.cleanUpOrphanedAbsenceAlertRules(ctx, key, promServer)
return r.cleanUpOrphanedAbsenceAlertRules(ctx, key, aPRName)
}
return nil
}

// Step 5: if it's an existing AbsencePrometheusRule then update otherwise create a new resource.
// Step 4: if it's an existing AbsencePrometheusRule then update otherwise create a new resource.
if existingAbsencePrometheusRule {
existingRuleGroups := unmodifiedAbsencePromRule.Spec.Groups
result := mergeAbsenceRuleGroups(promRuleName, existingRuleGroups, absenceRuleGroups)
Expand Down
79 changes: 79 additions & 0 deletions controllers/absence_prometheusrule_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Copyright 2024 SAP SE
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package controllers

import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// Specs for the generation of AbsencePrometheusRule names from name templates.
var _ = Describe("AbsencePrometheusRule", func() {
// pr is the fixture PrometheusRule that every generator in the table below is
// executed against. It carries both the "prometheus" and the "thanos-ruler" label.
pr := &monitoringv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Name: "foobar.alerts",
Namespace: "outerspace",
Labels: map[string]string{
"prometheus": "openstack",
"thanos-ruler": "titan",
},
},
}

DescribeTable("Name generation",
// Each entry supplies a name template, the name expected from it, and whether
// generating the name for pr is expected to fail.
func(tmplStr, expected string, shouldFail bool) {
gen, err := CreateAbsencePromRuleNameGenerator(tmplStr)
// Creating the generator must succeed for every entry; only the name
// generation itself is allowed to fail.
Expect(err).ToNot(HaveOccurred())
actual, err := gen(pr)
if shouldFail {
Expect(err).To(HaveOccurred())
return
}
Expect(err).ToNot(HaveOccurred())
Expect(actual).To(Equal(expected))
},
// Every expected name ends in absencePromRuleNameSuffix.
Entry("name that uses the original name",
`{{ .metadata.name }}`,
"foobar.alerts"+absencePromRuleNameSuffix,
false,
),
Entry("name that uses the namespace",
`{{ .metadata.namespace }}`,
"outerspace"+absencePromRuleNameSuffix,
false,
),
Entry("name that uses the original name and namespace",
`{{ .metadata.name }}-{{ .metadata.namespace }}`,
"foobar.alerts-outerspace"+absencePromRuleNameSuffix,
false,
),
Entry("name with prometheus label",
`{{ .metadata.labels.prometheus }}`,
"openstack"+absencePromRuleNameSuffix,
false,
),
// pr has both labels set, so the default template is expected to yield the
// "thanos-ruler" value rather than the "prometheus" one.
Entry("name with thanos-ruler label if it exists",
DefaultAbsencePromRuleNameTemplate,
"titan"+absencePromRuleNameSuffix,
false,
),
// Referencing a key that is not part of the template data must produce an error
// instead of a name.
Entry("name that references nonexistent metadata",
`{{ .metadata.doesntexist }}`,
"",
true,
),
)
})
1 change: 1 addition & 0 deletions controllers/labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (

labelNoAlertOnAbsence = "no_alert_on_absence"
labelPrometheusServer = "prometheus"
labelThanosRuler = "thanos-ruler"
)

// KeepLabel specifies which labels to keep on an absence alert rule.
Expand Down
6 changes: 5 additions & 1 deletion controllers/prometheusrule_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ type PrometheusRuleReconciler struct {
Scheme *runtime.Scheme
Log logr.Logger

PrometheusRuleName AbsencePromRuleNameGenerator
// KeepLabel is a map of labels that will be retained from the original alert rule and
// passed on to its corresponding absence alert rule.
KeepLabel KeepLabel
Expand Down Expand Up @@ -176,7 +177,10 @@ func (r *PrometheusRuleReconciler) reconcileObject(
// elapsed).
if parseBool(l[labelOperatorDisable]) {
log.V(logLevelDebug).Info("operator disabled for this PrometheusRule")
err := r.cleanUpOrphanedAbsenceAlertRules(ctx, key, l[labelPrometheusServer])
aPRName, err := r.PrometheusRuleName(obj)
if err == nil {
err = r.cleanUpOrphanedAbsenceAlertRules(ctx, key, aPRName)
}
if err != nil {
if !apierrors.IsNotFound(err) && !errors.Is(err, errCorrespondingAbsencePromRuleNotExists) {
log.Error(err, "could not clean up orphaned absence alert rules")
Expand Down
Loading

0 comments on commit ab73636

Please sign in to comment.