Skip to content

Commit

Permalink
feat: update alertmanager flags at runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
TheSpiritXIII committed Aug 19, 2024
1 parent 564db9f commit faaeb81
Show file tree
Hide file tree
Showing 13 changed files with 3,063 additions and 22 deletions.
4 changes: 2 additions & 2 deletions charts/values.global.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ images:
image: gke.gcr.io/gke-distroless/bash
tag: "gke_distroless_20240607.00_p0" # NOTE: Has to be quoted otherwise it will be treated as a number.
alertmanager:
image: gke.gcr.io/prometheus-engine/alertmanager@sha256
tag: "4311da6164f66c4097878c023d4aa5ab908641414e087ba4d5eb29b6126158bc"
image: gke.gcr.io/prometheus-engine/alertmanager
tag: "v0.25.1-gmp.8-gke.0"
prometheus:
# TODO(bwplotka): Change to "v2.45.3-gmp.6-gke.0" once tags are cloned.
image: gke.gcr.io/prometheus-engine/prometheus@sha256
Expand Down
13 changes: 2 additions & 11 deletions e2e/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"context"
"errors"
"fmt"
"strings"
"testing"

"github.com/GoogleCloudPlatform/prometheus-engine/pkg/operator"
Expand Down Expand Up @@ -118,6 +117,8 @@ receivers:
- name: "foobar"
route:
receiver: "foobar"
google_cloud:
external_url: "https://alertmanager.mycompany.com/"
`
secret := corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -181,7 +182,6 @@ route:
return false, fmt.Errorf("unexpected configuration (-want, +got): %s", diff)
}

// Check externalURL was set on statefulset.
ss := appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Name: operator.NameAlertmanager,
Expand All @@ -200,15 +200,6 @@ route:
if c.Name != operator.AlertmanagerContainerName {
continue
}
// We're mainly interested in the dynamic flags but checking the entire set including
// the static ones is ultimately simpler.
wantArgs := []string{
fmt.Sprintf("--web.external-url=%q", "https://alertmanager.mycompany.com/"),
}

if diff := cmp.Diff(strings.Join(wantArgs, " "), getEnvVar(c.Env, "EXTRA_ARGS")); diff != "" {
return false, fmt.Errorf("unexpected flags (-want, +got): %s", diff)
}
return true, nil
}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ require (
github.com/hashicorp/go-cleanhttp v0.5.2
github.com/oklog/run v1.1.0
github.com/oklog/ulid v1.3.1
github.com/prometheus/alertmanager v0.26.0
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/client_model v0.5.0
github.com/prometheus/common v0.47.0
Expand Down Expand Up @@ -108,7 +109,6 @@ require (
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/alertmanager v0.26.0 // indirect
github.com/prometheus/common/sigv4 v0.1.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
Expand Down
2 changes: 1 addition & 1 deletion manifests/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ spec:
readOnlyRootFilesystem: true
containers:
- name: alertmanager
image: gke.gcr.io/prometheus-engine/alertmanager@sha256:4311da6164f66c4097878c023d4aa5ab908641414e087ba4d5eb29b6126158bc
image: gke.gcr.io/prometheus-engine/alertmanager:v0.25.1-gmp.8-gke.0
args:
- --config.file=/alertmanager/config_out/config.yaml
- --storage.path=/alertmanager-data
Expand Down
6 changes: 6 additions & 0 deletions pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ func New(logger logr.Logger, clientConfig *rest.Config, opts Options) (*Operator
"metadata.name": NameRuleEvaluator,
}),
},
&appsv1.StatefulSet{}: {
Field: fields.SelectorFromSet(fields.Set{
"metadata.namespace": opts.OperatorNamespace,
"metadata.name": NameAlertmanager,
}),
},
}

// Determine whether VPA is installed in the cluster. If so, set up the scaling controller.
Expand Down
58 changes: 51 additions & 7 deletions pkg/operator/operator_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

monitoringv1 "github.com/GoogleCloudPlatform/prometheus-engine/pkg/operator/apis/monitoring/v1"
"github.com/go-logr/logr"
alertmanagerconfig "github.com/prometheus/alertmanager/config"
promcommonconfig "github.com/prometheus/common/config"
prommodel "github.com/prometheus/common/model"
promconfig "github.com/prometheus/prometheus/config"
Expand Down Expand Up @@ -399,6 +400,20 @@ func (r *operatorConfigReconciler) ensureAlertmanagerConfigSecret(ctx context.Co
// so that the managed AM pod doesn't crash loop.
logger.Info(fmt.Sprintf("alertmanager config secret not found in namespace %s: %s", pubNamespace, err.Error()))
} else {
config := alertmanagerConfig{}
if err := yaml.Unmarshal(b, &config); err != nil {
return fmt.Errorf("load alertmanager config: %w", err)
}
// Only set the value if we need to. This provides a fail-safe in case users replace our
// Alertmanager image with their own. Otherwise, if we always set it and they change the image,
// their Alertmanager will fail unless they have our patch.
if config.GoogleCloud.ExternalURL != spec.ExternalURL {
config.GoogleCloud.ExternalURL = spec.ExternalURL
b, err = yaml.Marshal(config)
if err != nil {
return fmt.Errorf("marshal alertmanager config: %w", err)
}
}
secret.Data[AlertmanagerConfigKey] = b
}

Expand All @@ -413,6 +428,35 @@ func (r *operatorConfigReconciler) ensureAlertmanagerConfigSecret(ctx context.Co
return nil
}

// alertmanagerConfig is the Alertmanager configuration as handled by the
// operator: the upstream configuration, inlined at the top level of the YAML
// document, plus an additional google_cloud section.
type alertmanagerConfig struct {
// Upstream Alertmanager configuration; its keys are inlined at the top level.
alertmanagerconfig.Config `yaml:",inline"`

// Google Cloud configuration. Matches our fork's configuration.
GoogleCloud googleCloudAlertmanagerConfig `yaml:"google_cloud,omitempty"`
}

// googleCloudAlertmanagerConfig holds the settings found under the
// google_cloud key of the Alertmanager configuration.
type googleCloudAlertmanagerConfig struct {
// ExternalURL is read from and written to the external_url key; it is omitted
// from marshaled output when empty.
ExternalURL string `yaml:"external_url,omitempty"`
}

// UnmarshalYAML decodes a YAML node into both the embedded Alertmanager
// configuration and the additional google_cloud section.
//
// The embedded configuration uses a custom unmarshaler, so this wrapper cannot
// be unmarshaled unless it provides its own (see
// https://github.com/go-yaml/yaml/issues/125). The node is therefore decoded
// twice: once into the embedded configuration and once into a shadow struct
// that replicates the extra fields. The first decode error, if any, is
// returned unchanged.
func (config *alertmanagerConfig) UnmarshalYAML(value *yaml.Node) error {
	// Shadow of the fields declared next to the embedded configuration; it must
	// replicate them together with their YAML tags.
	var extra struct {
		GoogleCloud googleCloudAlertmanagerConfig `yaml:"google_cloud,omitempty"`
	}

	err := value.Decode(&config.Config)
	if err == nil {
		err = value.Decode(&extra)
	}
	if err != nil {
		return err
	}

	config.GoogleCloud = extra.GoogleCloud
	return nil
}

// setContainerExtraArgs updates the EXTRA_ARGS environment variable in a given
// container. This is a pattern that only our binaries use to be able to read
// dynamic flags. See e.g.
Expand Down Expand Up @@ -457,14 +501,14 @@ func (r *operatorConfigReconciler) ensureAlertmanagerStatefulSet(ctx context.Con
return err
}

var flags []string
if externalURL := spec.ExternalURL; externalURL != "" {
flags = append(flags, fmt.Sprintf("--web.external-url=%q", externalURL))
}
setContainerExtraArgs(sset.Spec.Template.Spec.Containers, AlertmanagerContainerName, strings.Join(flags, " "))
setContainerExtraArgs(sset.Spec.Template.Spec.Containers, AlertmanagerContainerName, "")

// Upsert alertmanager StatefulSet.
return r.client.Update(ctx, &sset)
// Support not having UPDATE permission. We will remove it in the future.
// See: https://github.com/GoogleCloudPlatform/prometheus-engine/pull/1080
if err := r.client.Update(ctx, &sset); !apierrors.IsForbidden(err) {
return err
}
return nil
}

// ensureRuleEvaluatorDeployment reconciles the Deployment for rule-evaluator.
Expand Down
Loading

0 comments on commit faaeb81

Please sign in to comment.