Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update alertmanager flags at runtime #1074

Merged
merged 1 commit into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions charts/values.global.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ images:
image: gke.gcr.io/gke-distroless/bash
tag: "gke_distroless_20240607.00_p0" # NOTE: Has to be quoted otherwise it will be treated as a number.
alertmanager:
image: gke.gcr.io/prometheus-engine/alertmanager@sha256
tag: "4311da6164f66c4097878c023d4aa5ab908641414e087ba4d5eb29b6126158bc"
image: gke.gcr.io/prometheus-engine/alertmanager
tag: "v0.25.1-gmp.8-gke.0"
prometheus:
# TODO(bwplotka): Change to "v2.45.3-gmp.6-gke.0" once tags are cloned.
image: gke.gcr.io/prometheus-engine/prometheus@sha256
Expand Down
13 changes: 2 additions & 11 deletions e2e/alertmanager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import (
"context"
"errors"
"fmt"
"strings"
"testing"

"github.com/GoogleCloudPlatform/prometheus-engine/pkg/operator"
Expand Down Expand Up @@ -118,6 +117,8 @@ receivers:
- name: "foobar"
route:
receiver: "foobar"
google_cloud:
external_url: "https://alertmanager.mycompany.com/"
`
secret := corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -181,7 +182,6 @@ route:
return false, fmt.Errorf("unexpected configuration (-want, +got): %s", diff)
}

// Check externalURL was set on statefulset.
ss := appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Name: operator.NameAlertmanager,
Expand All @@ -200,15 +200,6 @@ route:
if c.Name != operator.AlertmanagerContainerName {
continue
}
// We're mainly interested in the dynamic flags but checking the entire set including
// the static ones is ultimately simpler.
wantArgs := []string{
fmt.Sprintf("--web.external-url=%q", "https://alertmanager.mycompany.com/"),
}

if diff := cmp.Diff(strings.Join(wantArgs, " "), getEnvVar(c.Env, "EXTRA_ARGS")); diff != "" {
return false, fmt.Errorf("unexpected flags (-want, +got): %s", diff)
}
return true, nil
}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ require (
github.com/hashicorp/go-cleanhttp v0.5.2
github.com/oklog/run v1.1.0
github.com/oklog/ulid v1.3.1
github.com/prometheus/alertmanager v0.26.0
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/client_model v0.5.0
github.com/prometheus/common v0.47.0
Expand Down Expand Up @@ -108,7 +109,6 @@ require (
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/alertmanager v0.26.0 // indirect
github.com/prometheus/common/sigv4 v0.1.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
Expand Down
2 changes: 1 addition & 1 deletion manifests/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ spec:
readOnlyRootFilesystem: true
containers:
- name: alertmanager
image: gke.gcr.io/prometheus-engine/alertmanager@sha256:4311da6164f66c4097878c023d4aa5ab908641414e087ba4d5eb29b6126158bc
image: gke.gcr.io/prometheus-engine/alertmanager:v0.25.1-gmp.8-gke.0
args:
- --config.file=/alertmanager/config_out/config.yaml
- --storage.path=/alertmanager-data
Expand Down
6 changes: 6 additions & 0 deletions pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ func New(logger logr.Logger, clientConfig *rest.Config, opts Options) (*Operator
"metadata.name": NameRuleEvaluator,
}),
},
&appsv1.StatefulSet{}: {
Field: fields.SelectorFromSet(fields.Set{
"metadata.namespace": opts.OperatorNamespace,
"metadata.name": NameAlertmanager,
}),
},
}

// Determine whether VPA is installed in the cluster. If so, set up the scaling controller.
Expand Down
58 changes: 51 additions & 7 deletions pkg/operator/operator_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

monitoringv1 "github.com/GoogleCloudPlatform/prometheus-engine/pkg/operator/apis/monitoring/v1"
"github.com/go-logr/logr"
alertmanagerconfig "github.com/prometheus/alertmanager/config"
promcommonconfig "github.com/prometheus/common/config"
prommodel "github.com/prometheus/common/model"
promconfig "github.com/prometheus/prometheus/config"
Expand Down Expand Up @@ -399,6 +400,20 @@ func (r *operatorConfigReconciler) ensureAlertmanagerConfigSecret(ctx context.Co
// so that the managed AM pod doesn't crash loop.
logger.Info(fmt.Sprintf("alertmanager config secret not found in namespace %s: %s", pubNamespace, err.Error()))
} else {
config := alertmanagerConfig{}
if err := yaml.Unmarshal(b, &config); err != nil {
return fmt.Errorf("load alertmanager config: %w", err)
}
// Only set the value if we need to. This provides a fail-safe in case users replace our
// Alertmanager image with their own. Otherwise, if we always set it and they change the image,
// their Alertmanager will fail unless they have our patch.
if config.GoogleCloud.ExternalURL != spec.ExternalURL {
config.GoogleCloud.ExternalURL = spec.ExternalURL
b, err = yaml.Marshal(config)
if err != nil {
return fmt.Errorf("marshal alertmanager config: %w", err)
}
}
secret.Data[AlertmanagerConfigKey] = b
}

Expand All @@ -413,6 +428,35 @@ func (r *operatorConfigReconciler) ensureAlertmanagerConfigSecret(ctx context.Co
return nil
}

// alertmanagerConfig is the upstream Alertmanager configuration extended with the
// Google Cloud section that our Alertmanager fork understands.
//
// NOTE(review): this type is re-serialized with yaml.Marshal elsewhere in this file.
// Upstream secret-typed fields presumably redact themselves when marshaled — confirm
// that credentials survive the unmarshal/marshal round trip.
type alertmanagerConfig struct {
// Upstream Alertmanager configuration. The inline tag asks the YAML encoder to emit
// its fields at the same document level as google_cloud.
alertmanagerconfig.Config `yaml:",inline"`

// Google Cloud configuration. Matches our fork's configuration.
GoogleCloud googleCloudAlertmanagerConfig `yaml:"google_cloud,omitempty"`
}

// googleCloudAlertmanagerConfig holds the settings stored under the google_cloud key
// of the Alertmanager configuration.
type googleCloudAlertmanagerConfig struct {
// ExternalURL is serialized as external_url and omitted when empty.
ExternalURL string `yaml:"external_url,omitempty"`
}

// UnmarshalYAML populates both the embedded upstream Alertmanager configuration and
// the google_cloud extension from the same YAML node.
//
// The embedded configuration brings its own custom unmarshaler, so relying on the
// inline tag alone does not work (see https://github.com/go-yaml/yaml/issues/125).
// The node is therefore decoded twice: once into the embedded configuration and once
// into a shadow struct that mirrors only the extension fields.
func (config *alertmanagerConfig) UnmarshalYAML(value *yaml.Node) error {
	// Shadow of the extension fields; must stay in sync with alertmanagerConfig.
	var extension struct {
		GoogleCloud googleCloudAlertmanagerConfig `yaml:"google_cloud,omitempty"`
	}

	// Decode the upstream part first; only attempt the extension if that succeeded.
	err := value.Decode(&config.Config)
	if err == nil {
		err = value.Decode(&extension)
	}
	if err != nil {
		return err
	}

	config.GoogleCloud = extension.GoogleCloud
	return nil
}

// setContainerExtraArgs updates the EXTRA_ARGS environment variable in a given
// container. This is a pattern that only our binaries use to be able to read dynamic
// flags. See e.g.
Expand Down Expand Up @@ -457,14 +501,14 @@ func (r *operatorConfigReconciler) ensureAlertmanagerStatefulSet(ctx context.Con
return err
}

var flags []string
if externalURL := spec.ExternalURL; externalURL != "" {
flags = append(flags, fmt.Sprintf("--web.external-url=%q", externalURL))
}
setContainerExtraArgs(sset.Spec.Template.Spec.Containers, AlertmanagerContainerName, strings.Join(flags, " "))
setContainerExtraArgs(sset.Spec.Template.Spec.Containers, AlertmanagerContainerName, "")

// Upsert alertmanager StatefulSet.
return r.client.Update(ctx, &sset)
// Support not having UPDATE permission. We will remove it in the future.
// See: https://github.com/GoogleCloudPlatform/prometheus-engine/pull/1080
if err := r.client.Update(ctx, &sset); !apierrors.IsForbidden(err) {
return err
}
return nil
}

// ensureRuleEvaluatorDeployment reconciles the Deployment for rule-evaluator.
Expand Down
Loading
Loading