correctly track existence of rules managed by opni alerting
alexandreLamarre committed Oct 27, 2023
1 parent 09448f2 commit 6323bd1
Showing 4 changed files with 114 additions and 39 deletions.
19 changes: 16 additions & 3 deletions pkg/alerting/drivers/cortex/cortex.go
@@ -9,6 +9,7 @@ import (
"github.com/prometheus/prometheus/model/rulefmt"
"github.com/rancher/opni/pkg/alerting/message"
"github.com/rancher/opni/pkg/alerting/metrics"
"github.com/rancher/opni/pkg/alerting/shared"
alertingv1 "github.com/rancher/opni/pkg/apis/alerting/v1"
"github.com/samber/lo"
"gopkg.in/yaml.v3"
@@ -19,6 +20,12 @@ Contains the struct/function adapters required for opni alerting to
communicate with cortex.
*/

const (
MetadataCortexNamespace = "opni.io/cortex-rule-namespace"
MetadataCortexGroup = "opni.io/cortex-rule-group"
MetadataCortexRuleName = "opni.io/cortex-rule-name"
)

const alertingSuffix = "-opni-alerting"

// this enforces whatever default the remote prometheus instance has
@@ -58,11 +65,11 @@ func NewPrometheusAlertingRule(
info alertingv1.IndexableMetric,
interval *time.Duration,
rule metrics.AlertRuleBuilder,
- ) (*rulefmt.RuleGroup, error) {
+ ) (ruleGroup *rulefmt.RuleGroup, metadata map[string]string, err error) {
idLabels := ConstructIdLabelsForRecordingRule(alertId)
alertingRule, err := rule.Build(alertId)
if err != nil {
- return nil, err
+ return nil, nil, err
}
recordingRuleFmt := &rulefmt.RuleNode{
Record: yaml.Node{
@@ -88,9 +95,15 @@ func NewPrometheusAlertingRule(
promInterval = prommodel.Duration(*interval)
}

- return &rulefmt.RuleGroup{
+ rg := &rulefmt.RuleGroup{
Name: RuleIdFromUuid(alertId),
Interval: promInterval,
Rules: []rulefmt.RuleNode{*alertingRule, *recordingRuleFmt},
}

return rg, map[string]string{
MetadataCortexNamespace: shared.OpniAlertingCortexNamespace,
MetadataCortexGroup: rg.Name,
MetadataCortexRuleName: alertingRule.Alert.Value,
}, nil
}
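An illustrative sketch (separate from the diff above): the key change in cortex.go is that NewPrometheusAlertingRule now returns a metadata map alongside the generated rule group, keyed by the new MetadataCortex* constants, so callers can record the exact Cortex coordinates of the rule they just created. The minimal, self-contained Go sketch below only shows that contract; the map values are placeholders, and the namespace string stands in for shared.OpniAlertingCortexNamespace.

package main

import "fmt"

// Stand-ins for the new metadata keys added in cortex.go.
const (
    metadataCortexNamespace = "opni.io/cortex-rule-namespace"
    metadataCortexGroup     = "opni.io/cortex-rule-group"
    metadataCortexRuleName  = "opni.io/cortex-rule-name"
)

func main() {
    // Shape of the map returned alongside the rule group: the Cortex
    // namespace, the rule group name, and the alerting rule name.
    md := map[string]string{
        metadataCortexNamespace: "opni-alerting",      // placeholder namespace value
        metadataCortexGroup:     "example-rule-group", // group name derived from the alert id
        metadataCortexRuleName:  "example-alert-rule", // the generated alerting rule's name
    }

    // A consumer such as the status endpoint can later look the rule up by
    // exact group and rule name instead of substring-matching on an id.
    group, okGroup := md[metadataCortexGroup]
    rule, okRule := md[metadataCortexRuleName]
    fmt.Println(group, okGroup, rule, okRule)
}

Because these coordinates end up stored on the condition itself, the status logic further down no longer has to guess which Cortex group belongs to a condition.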
15 changes: 13 additions & 2 deletions plugins/alerting/pkg/alerting/alarms/v1/rules_sync.go
@@ -10,7 +10,10 @@ import (
"github.com/rancher/opni/pkg/auth/cluster"
"github.com/rancher/opni/pkg/util"
"github.com/rancher/opni/plugins/alerting/pkg/apis/rules"
"github.com/samber/lo"
"go.uber.org/multierr"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/testing/protocmp"
"google.golang.org/protobuf/types/known/emptypb"
"google.golang.org/protobuf/types/known/timestamppb"
@@ -51,7 +54,6 @@ func areRuleSpecsEqual(old, new *alertingv1.AlertCondition) bool {
ignoreOpniConfigurations(oldIgnoreOpniConf)
ignoreOpniConfigurations(newIgnoreOpniConf)
return cmp.Equal(oldIgnoreOpniConf, newIgnoreOpniConf, protocmp.Transform())

}

func (a *AlarmServerComponent) SyncRules(ctx context.Context, rules *rules.RuleManifest) (*emptypb.Empty, error) {
@@ -89,13 +91,22 @@ func (a *AlarmServerComponent) SyncRules(ctx context.Context, rules *rules.RuleM

existing, err := condStorage.Group(rule.GetGroupId().Id).Get(ctx, rule.GetRuleId().Id)
if err == nil {
metadata := existing.GetMetadata()
if metadata == nil {
metadata = map[string]string{}
}
// keep opni-managed metadata, unless overridden
retMetadata := lo.Assign(metadata, incomingCond.Metadata)
if !areRuleSpecsEqual(existing, incomingCond) {
applyMutableReadOnlyFields(incomingCond, existing)
incomingCond.Metadata = retMetadata
if err := condStorage.Group(rule.GroupId.Id).Put(ctx, rule.RuleId.Id, incomingCond); err != nil {
errors = append(errors, err)
}
}
- } else {
+ }
+
+ if st, ok := status.FromError(err); ok && st.Code() == codes.NotFound {
if err := condStorage.Group(rule.GroupId.Id).Put(ctx, rule.RuleId.Id, incomingCond); err != nil {
errors = append(errors, err)
}
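An illustrative sketch (separate from the diff above): two mechanics carry the rules_sync.go hunk. lo.Assign merges maps left to right, so opni-managed metadata already stored on the condition survives unless the incoming sync explicitly overrides a key, and a NotFound gRPC status from the condition store is now the signal to create the condition rather than update it. The runnable Go sketch below uses placeholder keys and values.

package main

import (
    "fmt"

    "github.com/samber/lo"
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

func main() {
    // Opni-managed metadata already stored on the condition.
    existing := map[string]string{
        "opni.io/cortex-rule-group": "opni-alert-123",
        "opni.io/alarm-hash":        "abc",
    }
    // Metadata carried by the incoming rule-sync payload.
    incoming := map[string]string{"opni.io/alarm-hash": "def"}

    // lo.Assign merges left to right, so incoming keys win while untouched
    // opni-managed keys (the cortex rule group here) are preserved.
    fmt.Println(lo.Assign(existing, incoming))
    // map[opni.io/alarm-hash:def opni.io/cortex-rule-group:opni-alert-123]

    // A storage lookup that misses surfaces as a gRPC NotFound status; that
    // is the cue to create the condition instead of updating it.
    err := status.Error(codes.NotFound, "condition not found")
    if st, ok := status.FromError(err); ok && st.Code() == codes.NotFound {
        fmt.Println("condition missing, creating it")
    }
}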
25 changes: 18 additions & 7 deletions plugins/alerting/pkg/alerting/alarms/v1/setup.go
@@ -14,12 +14,14 @@ import (
alertingv1 "github.com/rancher/opni/pkg/apis/alerting/v1"
corev1 "github.com/rancher/opni/pkg/apis/core/v1"
"github.com/rancher/opni/plugins/metrics/apis/cortexadmin"
"github.com/samber/lo"
"gopkg.in/yaml.v3"
)

const (
metadataLastAppliedHashKey = "opni.io/alarm-hash"
- metadataInactiveAlarm = "opni.io/alarm-inactive"
+ // this metadata key indicates this has yet to be activated in a remote backend
+ metadataInactiveAlarm = "opni.io/alarm-inactive"
)

func (p *AlarmServerComponent) shouldDelete(
@@ -106,6 +108,8 @@ func (p *AlarmServerComponent) activateCondition(
cond *alertingv1.AlertCondition,
conditionId string,
) (ref *corev1.Reference, retErr error) {
lg := p.logger.With("condition", cond.GetName(), "id", cond.GetId(), "cond-group", cond.GroupId)
lg.Info("activating alarm")
conditionStorage, err := p.conditionStorage.GetContext(ctx)
if err != nil {
return nil, err
Expand All @@ -128,6 +132,8 @@ func (p *AlarmServerComponent) activateCondition(
delete(md, metadataInactiveAlarm)
cond.Metadata = md
retErr = conditionStorage.Group(cond.GroupId).Put(ctx, conditionId, cond)
} else {
lg.Error("failed to activate alarm")
}
}()

@@ -215,14 +221,16 @@ func (p *AlarmServerComponent) handleKubeAlertCreation(ctx context.Context, cond
if err != nil {
return err
}
- kubeRuleContent, err := cortex.NewPrometheusAlertingRule(newId, alertName,
+ kubeRuleContent, md, err := cortex.NewPrometheusAlertingRule(newId, alertName,
cond.GetRoutingLabels(),
cond.GetRoutingAnnotations(),
k, nil, baseKubeRule,
)
if err != nil {
return err
}

cond.Metadata = lo.Assign(cond.Metadata, md)
out, err := yaml.Marshal(kubeRuleContent)
if err != nil {
return err
@@ -260,13 +268,14 @@ func (p *AlarmServerComponent) handleCpuSaturationAlertCreation(
if err != nil {
return err
}
- cpuRuleContent, err := cortex.NewPrometheusAlertingRule(conditionId, alertName,
+ cpuRuleContent, md, err := cortex.NewPrometheusAlertingRule(conditionId, alertName,
cond.GetRoutingLabels(),
cond.GetRoutingAnnotations(),
c, nil, baseCpuRule)
if err != nil {
return err
}
cond.Metadata = lo.Assign(cond.Metadata, md)
out, err := yaml.Marshal(cpuRuleContent)
if err != nil {
return err
@@ -297,7 +306,7 @@ func (p *AlarmServerComponent) handleMemorySaturationAlertCreation(ctx context.C
if err != nil {
return err
}
- memRuleContent, err := cortex.NewPrometheusAlertingRule(conditionId, alertName,
+ memRuleContent, md, err := cortex.NewPrometheusAlertingRule(conditionId, alertName,
cond.GetRoutingLabels(),
cond.GetRoutingAnnotations(),
m,
@@ -307,6 +316,7 @@ func (p *AlarmServerComponent) handleMemorySaturationAlertCreation(ctx context.C
if err != nil {
return err
}
cond.Metadata = lo.Assign(cond.Metadata, md)

out, err := yaml.Marshal(memRuleContent)
if err != nil {
@@ -336,7 +346,7 @@ func (p *AlarmServerComponent) handleFsSaturationAlertCreation(ctx context.Conte
if err != nil {
return err
}
- fsRuleContent, err := cortex.NewPrometheusAlertingRule(
+ fsRuleContent, md, err := cortex.NewPrometheusAlertingRule(
conditionId,
alertName,
cond.GetRoutingLabels(),
@@ -348,7 +358,7 @@ func (p *AlarmServerComponent) handleFsSaturationAlertCreation(ctx context.Conte
if err != nil {
return err
}

cond.Metadata = lo.Assign(cond.Metadata, md)
out, err := yaml.Marshal(fsRuleContent)
if err != nil {
return err
@@ -376,13 +386,14 @@ func (p *AlarmServerComponent) handlePrometheusQueryAlertCreation(ctx context.Co
Annotations: map[string]string{},
}

- baseRuleContent, err := cortex.NewPrometheusAlertingRule(conditionId, alertName,
+ baseRuleContent, md, err := cortex.NewPrometheusAlertingRule(conditionId, alertName,
cond.GetRoutingLabels(),
cond.GetRoutingAnnotations(),
q, nil, baseRule)
if err != nil {
return err
}
cond.Metadata = lo.Assign(cond.Metadata, md)
var out bytes.Buffer
encoder := yaml.NewEncoder(&out)
err = encoder.Encode(baseRuleContent)
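An illustrative sketch (separate from the diff above): every handle*AlertCreation in setup.go now follows the same pattern, build the rule, merge the returned Cortex coordinates into cond.Metadata with lo.Assign, then marshal the rule group, while the metadataInactiveAlarm key marks a condition whose rule group has not yet been pushed to the metrics backend; activateCondition deletes the key once the remote apply succeeds. The minimal Go sketch below traces that marker's lifecycle using plain maps in place of the real condition type.

package main

import "fmt"

// Stand-ins for the metadata keys used in setup.go; values are illustrative.
const (
    metadataLastAppliedHashKey = "opni.io/alarm-hash"
    metadataInactiveAlarm      = "opni.io/alarm-inactive"
)

func main() {
    // When an alarm is first stored, it is tagged as inactive: its Prometheus
    // rule group has not been pushed to the metrics backend yet.
    cond := map[string]string{
        metadataLastAppliedHashKey: "abc123",
        metadataInactiveAlarm:      "true",
    }
    fmt.Println("before activation:", cond)

    // Activation clears the marker only after the remote rule is applied, so
    // the status endpoint can tell "not created yet" apart from "missing".
    delete(cond, metadataInactiveAlarm)
    fmt.Println("after activation:", cond)
}

Keeping the marker in condition metadata is what lets the status changes below report a pending state for alarms whose rule group simply has not been created yet.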
94 changes: 67 additions & 27 deletions plugins/alerting/pkg/alerting/alarms/v1/status.go
@@ -2,13 +2,13 @@ package alarms

import (
"context"
"strings"
"sync"
"time"

"github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/alertmanager/pkg/labels"
promClient "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/rancher/opni/pkg/alerting/drivers/cortex"
"github.com/rancher/opni/pkg/alerting/drivers/routing"
"github.com/rancher/opni/pkg/alerting/shared"
alertingv1 "github.com/rancher/opni/pkg/apis/alerting/v1"
@@ -83,7 +83,7 @@ func (a *AlarmServerComponent) checkMetricsClusterStatus(
Reason: "cluster does not have metrics capabilities installed",
}
}
- if status := evaluatePrometheusRuleHealth(metricsInfo.cortexRules, cond.GetId()); status != nil {
+ if status := a.evaluatePrometheusRuleHealth(cond, metricsInfo.cortexRules); status != nil {
return status
}
return &alertingv1.AlertStatusResponse{
@@ -372,44 +372,84 @@ func statusFromAlertGroup(
return defaultState
}

- func evaluatePrometheusRuleHealth(ruleList *cortexadmin.RuleGroups, id string) *alertingv1.AlertStatusResponse {
+ func (a *AlarmServerComponent) evaluatePrometheusRuleHealth(cond *alertingv1.AlertCondition, ruleList *cortexadmin.RuleGroups) *alertingv1.AlertStatusResponse {
if ruleList == nil {
return &alertingv1.AlertStatusResponse{
State: alertingv1.AlertConditionState_Pending,
Reason: "waiting for monitoring rule state(s) to be available from metrics backend",
}
}
insufficientMetadata := &alertingv1.AlertStatusResponse{
State: alertingv1.AlertConditionState_Pending,
Reason: "insufficient metadata required to reference remote rule",
}
md := cond.GetMetadata()
if md == nil {
return insufficientMetadata
}
groupName, ok := md[cortex.MetadataCortexGroup]
if !ok {
return insufficientMetadata
}

ruleName, ok := md[cortex.MetadataCortexRuleName]
if !ok {
return insufficientMetadata
}

for _, group := range ruleList.GetGroups() {
- if strings.Contains(group.GetName(), id) {
- if len(group.GetRules()) == 0 {
- return &alertingv1.AlertStatusResponse{
- State: alertingv1.AlertConditionState_Pending,
- Reason: "waiting for monitoring rule state(s) to be available from metrics backend",
- }
- }
- healthList := lo.Map(group.GetRules(), func(rule *cortexadmin.Rule, _ int) string {
- return rule.GetHealth()
- })
- health := lo.Associate(healthList, func(health string) (string, struct{}) {
- return health, struct{}{}
- })
- if _, ok := health[promClient.RuleHealthBad]; ok {
- return &alertingv1.AlertStatusResponse{
- State: alertingv1.AlertConditionState_Invalidated,
- Reason: "one or more metric dependencies are unable to be evaluated",
- }
- }
- if _, ok := health[promClient.RuleHealthUnknown]; ok {
- return &alertingv1.AlertStatusResponse{
- State: alertingv1.AlertConditionState_Pending,
- Reason: "waiting for monitoring rule state(s) to be available from metrics backend",
- }
- }
- return &alertingv1.AlertStatusResponse{
- State: alertingv1.AlertConditionState_Ok,
- }
- }
+ if group.GetName() != groupName {
+ continue
+ }
+ if len(group.GetRules()) == 0 {
+ return &alertingv1.AlertStatusResponse{
+ State: alertingv1.AlertConditionState_Pending,
+ Reason: "waiting for monitoring rule state(s) to be available from metrics backend",
+ }
+ }
+ found := false
+ for _, rule := range group.GetRules() {
+ if rule.GetName() == ruleName {
+ found = true
+ }
+ }
+ if !found {
+ return &alertingv1.AlertStatusResponse{
+ State: alertingv1.AlertConditionState_Pending,
+ Reason: "prometheus alerting rule is not found in metrics backend",
+ }
+ }
+ healthList := lo.Map(group.GetRules(), func(rule *cortexadmin.Rule, _ int) string {
+ return rule.GetHealth()
+ })
+ health := lo.Associate(healthList, func(health string) (string, struct{}) {
+ return health, struct{}{}
+ })
+ if _, ok := health[promClient.RuleHealthBad]; ok {
+ return &alertingv1.AlertStatusResponse{
+ State: alertingv1.AlertConditionState_Invalidated,
+ Reason: "one or more prometheus rules in this group are unable to be evaluated",
+ }
+ }
+ if _, ok := health[promClient.RuleHealthUnknown]; ok {
+ return &alertingv1.AlertStatusResponse{
+ State: alertingv1.AlertConditionState_Pending,
+ Reason: "waiting for all prometheus rule state(s) to be available from metrics backend",
+ }
+ }
+ return &alertingv1.AlertStatusResponse{
+ State: alertingv1.AlertConditionState_Ok,
+ }
}
+ if _, ok := md[metadataInactiveAlarm]; ok {
+ return &alertingv1.AlertStatusResponse{
+ State: alertingv1.AlertConditionState_Pending,
+ Reason: "prometheus rule group has not been created yet",
+ }
+ }
return &alertingv1.AlertStatusResponse{
- State: alertingv1.AlertConditionState_Pending,
- Reason: "prometheus rule is not found in metrics backend",
+ State: alertingv1.AlertConditionState_Invalidated,
+ Reason: "prometheus rule group could not be found in metrics backend",
}
}
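An illustrative sketch (separate from the diff above): the rewritten evaluatePrometheusRuleHealth looks the rule group up by the exact name stored in the condition's metadata, checks that the expected alerting rule exists in that group, and then collapses the per-rule health values into a single state. The Go sketch below shows only that roll-up, under the assumption that rule health strings mirror Prometheus's "ok", "unknown", and "err" values (the promClient.RuleHealth* constants); the real code operates on cortexadmin.Rule objects and returns alertingv1.AlertStatusResponse values.

package main

import (
    "fmt"

    "github.com/samber/lo"
)

// Assumed stand-ins for the Prometheus rule health values.
const (
    ruleHealthGood    = "ok"
    ruleHealthUnknown = "unknown"
    ruleHealthBad     = "err"
)

// rollUpHealth collapses the health of every rule in the matched group into a
// set, then reports the worst case: bad beats unknown beats ok.
func rollUpHealth(healths []string) string {
    set := lo.Associate(healths, func(h string) (string, struct{}) {
        return h, struct{}{}
    })
    if _, ok := set[ruleHealthBad]; ok {
        return "Invalidated: one or more rules cannot be evaluated"
    }
    if _, ok := set[ruleHealthUnknown]; ok {
        return "Pending: waiting for rule state from the metrics backend"
    }
    return "Ok"
}

func main() {
    fmt.Println(rollUpHealth([]string{ruleHealthGood, ruleHealthUnknown}))
    fmt.Println(rollUpHealth([]string{ruleHealthGood, ruleHealthGood}))
}

Using a set rather than iterating per rule keeps the precedence explicit: a single bad rule invalidates the alarm regardless of how many healthy rules the group contains.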
