diff --git a/CHANGELOG.md b/CHANGELOG.md
index e9df988ad64..991b4a4b98d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ### Grafana Mimir
 
+* [FEATURE] Alertmanager: Added `-alertmanager.max-silences-count` and `-alertmanager.max-silence-size-bytes` to set limits on per-tenant silences. Disabled by default. #6898
 * [CHANGE] Build: `grafana/mimir` docker image is now based on `gcr.io/distroless/static-debian12` image. Alpine-based docker image is still available as `grafana/mimir-alpine`, until Mimir 2.15. #8204
 * [CHANGE] Ingester: `/ingester/flush` endpoint is now only allowed to execute while the ingester is in `Running` state. The 503 status code is returned if the endpoint is called while the ingester is not in `Running` state. #7486
 * [CHANGE] Distributor: Include label name in `err-mimir-label-value-too-long` error message: #7740
diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json
index c77a0066876..191ae32a7de 100644
--- a/cmd/mimir/config-descriptor.json
+++ b/cmd/mimir/config-descriptor.json
@@ -4052,6 +4052,26 @@
       "fieldFlag": "alertmanager.max-config-size-bytes",
       "fieldType": "int"
     },
+    {
+      "kind": "field",
+      "name": "alertmanager_max_silences_count",
+      "required": false,
+      "desc": "Maximum number of active and pending silences that a tenant can have at once. 0 = no limit.",
+      "fieldValue": null,
+      "fieldDefaultValue": 0,
+      "fieldFlag": "alertmanager.max-silences-count",
+      "fieldType": "int"
+    },
+    {
+      "kind": "field",
+      "name": "alertmanager_max_silence_size_bytes",
+      "required": false,
+      "desc": "Maximum silence size in bytes. 0 = no limit.",
+      "fieldValue": null,
+      "fieldDefaultValue": 0,
+      "fieldFlag": "alertmanager.max-silence-size-bytes",
+      "fieldType": "int"
+    },
     {
       "kind": "field",
       "name": "alertmanager_max_templates_count",
diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl
index 1cbd27d54c6..f28ddcdcd28 100644
--- a/cmd/mimir/help-all.txt.tmpl
+++ b/cmd/mimir/help-all.txt.tmpl
@@ -191,6 +191,10 @@ Usage of ./cmd/mimir/mimir:
     	Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.
   -alertmanager.max-recv-msg-size int
     	Maximum size (bytes) of an accepted HTTP request body. (default 104857600)
+  -alertmanager.max-silence-size-bytes int
+    	Maximum silence size in bytes. 0 = no limit.
+  -alertmanager.max-silences-count int
+    	Maximum number of active and pending silences that a tenant can have at once. 0 = no limit.
   -alertmanager.max-template-size-bytes int
    	Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.
   -alertmanager.max-templates-count int
diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl
index f5c460fca00..706d98488fc 100644
--- a/cmd/mimir/help.txt.tmpl
+++ b/cmd/mimir/help.txt.tmpl
@@ -81,6 +81,10 @@ Usage of ./cmd/mimir/mimir:
    	Maximum size of configuration file for Alertmanager that tenant can upload via Alertmanager API. 0 = no limit.
  -alertmanager.max-dispatcher-aggregation-groups int
    	Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.
+  -alertmanager.max-silence-size-bytes int
+    	Maximum silence size in bytes. 0 = no limit.
+  -alertmanager.max-silences-count int
+    	Maximum number of active and pending silences that a tenant can have at once. 0 = no limit.
   -alertmanager.max-template-size-bytes int
    	Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.
   -alertmanager.max-templates-count int
diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md
index 54141ddf7cb..32fdf886a78 100644
--- a/docs/sources/mimir/configure/configuration-parameters/index.md
+++ b/docs/sources/mimir/configure/configuration-parameters/index.md
@@ -3440,6 +3440,15 @@ The `limits` block configures default and per-tenant limits imposed by components
 # CLI flag: -alertmanager.max-config-size-bytes
 [alertmanager_max_config_size_bytes: <int> | default = 0]
 
+# Maximum number of active and pending silences that a tenant can have at once.
+# 0 = no limit.
+# CLI flag: -alertmanager.max-silences-count
+[alertmanager_max_silences_count: <int> | default = 0]
+
+# Maximum silence size in bytes. 0 = no limit.
+# CLI flag: -alertmanager.max-silence-size-bytes
+[alertmanager_max_silence_size_bytes: <int> | default = 0]
+
 # Maximum number of templates in tenant's Alertmanager configuration uploaded
 # via Alertmanager API. 0 = no limit.
 # CLI flag: -alertmanager.max-templates-count
diff --git a/go.mod b/go.mod
index 900cb09c1ed..7ed15c6d2c3 100644
--- a/go.mod
+++ b/go.mod
@@ -182,7 +182,7 @@ require (
 	github.com/hashicorp/consul/api v1.28.2 // indirect
 	github.com/hashicorp/errwrap v1.1.0 // indirect
 	github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
-	github.com/hashicorp/go-hclog v1.5.0 // indirect
+	github.com/hashicorp/go-hclog v1.6.2 // indirect
 	github.com/hashicorp/go-immutable-radix v1.3.1 // indirect
 	github.com/hashicorp/go-msgpack v1.1.5 // indirect
 	github.com/hashicorp/go-multierror v1.1.1 // indirect
@@ -284,4 +284,4 @@ replace github.com/opentracing-contrib/go-stdlib => github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956
 replace github.com/opentracing-contrib/go-grpc => github.com/charleskorn/go-grpc v0.0.0-20231024023642-e9298576254f
 
 // Replacing prometheus/alertmanager with our fork.
-replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240524091923-8090d8837b5f
+replace github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240531172444-6ad94e405c5a
diff --git a/go.sum b/go.sum
index bd6ec0201e1..dd7453cd706 100644
--- a/go.sum
+++ b/go.sum
@@ -521,8 +521,8 @@ github.com/grafana/mimir-prometheus v0.0.0-20240515135245-e5b85c151ba8 h1:XmqfG3
 github.com/grafana/mimir-prometheus v0.0.0-20240515135245-e5b85c151ba8/go.mod h1:ZlD3SoAHSwXK5VGLHv78Jh5kOpgSLaQAzt9gxq76fLM=
 github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956 h1:em1oddjXL8c1tL0iFdtVtPloq2hRPen2MJQKoAWpxu0=
 github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956/go.mod h1:qtI1ogk+2JhVPIXVc6q+NHziSmy2W5GbdQZFUHADCBU=
-github.com/grafana/prometheus-alertmanager v0.25.1-0.20240524091923-8090d8837b5f h1:EtKg1joztl0yM5tqj51LzzUmiWzPz/5zrYr8Bc7Y5pk=
-github.com/grafana/prometheus-alertmanager v0.25.1-0.20240524091923-8090d8837b5f/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc=
+github.com/grafana/prometheus-alertmanager v0.25.1-0.20240531172444-6ad94e405c5a h1:0zyw9u1O0PBB0bep9SyfM0sz2Q4XKYuNpTcIGkW3jSk=
+github.com/grafana/prometheus-alertmanager v0.25.1-0.20240531172444-6ad94e405c5a/go.mod h1:01sXtHoRwI8W324IPAzuxDFOmALqYLCOhvSC2fUHWXc=
 github.com/grafana/pyroscope-go/godeltaprof v0.1.6 h1:nEdZ8louGAplSvIJi1HVp7kWvFvdiiYg3COLlTwJiFo=
 github.com/grafana/pyroscope-go/godeltaprof v0.1.6/go.mod h1:Tk376Nbldo4Cha9RgiU7ik8WKFkNpfds98aUzS8omLE=
 github.com/grafana/regexp v0.0.0-20240531075221-3685f1377d7b h1:oMAq12GxTpwo9jxbnG/M4F/HdpwbibTaVoxNA0NZprY=
@@ -549,8 +549,8 @@ github.com/hashicorp/go-hclog v0.9.2/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrj
 github.com/hashicorp/go-hclog v0.12.0/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ=
 github.com/hashicorp/go-hclog v0.16.2/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ=
 github.com/hashicorp/go-hclog v1.2.0/go.mod h1:whpDNt7SSdeAju8AWKIWsul05p54N/39EeqMAyrmvFQ=
-github.com/hashicorp/go-hclog v1.5.0 h1:bI2ocEMgcVlz55Oj1xZNBsVi900c7II+fWDyV9o+13c=
-github.com/hashicorp/go-hclog v1.5.0/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
+github.com/hashicorp/go-hclog v1.6.2 h1:NOtoftovWkDheyUM/8JW3QMiXyxJK3uHRK7wV04nD2I=
+github.com/hashicorp/go-hclog v1.6.2/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
 github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
 github.com/hashicorp/go-immutable-radix v1.3.1 h1:DKHmCUm2hRBK510BaiZlwvpD40f8bJFeZnpfm2KLowc=
 github.com/hashicorp/go-immutable-radix v1.3.1/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60=
diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go
index 8372f39b536..5cef1edf12e 100644
--- a/pkg/alertmanager/alertmanager.go
+++ b/pkg/alertmanager/alertmanager.go
@@ -220,8 +220,12 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
 	am.silences, err = silence.New(silence.Options{
 		SnapshotFile: silencesFile,
 		Retention:    cfg.Retention,
-		Logger:       log.With(am.logger, "component", "silences"),
-		Metrics:      am.registry,
+		Limits: silence.Limits{
+			MaxSilences:        cfg.Limits.AlertmanagerMaxSilencesCount(cfg.UserID),
+			MaxPerSilenceBytes: cfg.Limits.AlertmanagerMaxSilenceSizeBytes(cfg.UserID),
+		},
+		Logger:  log.With(am.logger, "component", "silences"),
+		Metrics: am.registry,
 	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to create silences: %v", err)
diff --git a/pkg/alertmanager/alertmanager_test.go b/pkg/alertmanager/alertmanager_test.go
index b1db1c7f187..ff04fd3c001 100644
--- a/pkg/alertmanager/alertmanager_test.go
+++ b/pkg/alertmanager/alertmanager_test.go
@@ -19,6 +19,7 @@ import (
 	"github.com/prometheus/alertmanager/cluster/clusterpb"
 	"github.com/prometheus/alertmanager/config"
 	"github.com/prometheus/alertmanager/featurecontrol"
+	"github.com/prometheus/alertmanager/silence/silencepb"
 	"github.com/prometheus/alertmanager/types"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/testutil"
@@ -323,3 +324,86 @@ func testLimiter(t *testing.T, limits Limits, ops []callbackOp) {
 		assert.Equal(t, op.expectedTotalSize, totalSize, "wrong total size, op %d", ix)
 	}
 }
+
+func TestSilenceLimits(t *testing.T) {
+	user := "test"
+
+	r := prometheus.NewPedanticRegistry()
+	am, err := New(&Config{
+		UserID: user,
+		Logger: log.NewNopLogger(),
+		Limits: &mockAlertManagerLimits{
+			maxSilencesCount:    1,
+			maxSilenceSizeBytes: 2 << 11, // 4KB
+		},
+		Features:          featurecontrol.NoopFlags{},
+		TenantDataDir:     t.TempDir(),
+		ExternalURL:       &url.URL{Path: "/am"},
+		ShardingEnabled:   true,
+		Store:             prepareInMemoryAlertStore(),
+		Replicator:        &stubReplicator{},
+		ReplicationFactor: 1,
+		// We have to set this interval non-zero, though we don't need the persister to do anything.
+		PersisterConfig: PersisterConfig{Interval: time.Hour},
+	}, r)
+	require.NoError(t, err)
+	defer am.StopAndWait()
+
+	// Inserting sil1 should succeed without error.
+	sil1 := &silencepb.Silence{
+		Matchers: []*silencepb.Matcher{{Name: "a", Pattern: "b"}},
+		StartsAt: time.Now(),
+		EndsAt:   time.Now().Add(5 * time.Minute),
+	}
+	id1, err := am.silences.Set(sil1)
+	require.NoError(t, err)
+	require.NotEqual(t, "", id1)
+
+	// Inserting sil2 should fail because the maximum number of silences
+	// has been exceeded.
+	sil2 := &silencepb.Silence{
+		Matchers: []*silencepb.Matcher{{Name: "a", Pattern: "b"}},
+		StartsAt: time.Now(),
+		EndsAt:   time.Now().Add(5 * time.Minute),
+	}
+	id2, err := am.silences.Set(sil2)
+	require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)")
+	require.Equal(t, "", id2)
+
+	// Expire sil1. This should allow sil2 to be inserted.
+	require.NoError(t, am.silences.Expire(id1))
+	id2, err = am.silences.Set(sil2)
+	require.NoError(t, err)
+	require.NotEqual(t, "", id2)
+
+	// Should be able to update sil2 without hitting the limit.
+	_, err = am.silences.Set(sil2)
+	require.NoError(t, err)
+
+	// Expire sil2.
+	require.NoError(t, am.silences.Expire(id2))
+
+	// Inserting sil3 should fail because it exceeds the maximum size.
+	sil3 := &silencepb.Silence{
+		Matchers: []*silencepb.Matcher{
+			{
+				Name:    strings.Repeat("a", 2<<9),
+				Pattern: strings.Repeat("b", 2<<9),
+			},
+			{
+				Name:    strings.Repeat("c", 2<<9),
+				Pattern: strings.Repeat("d", 2<<9),
+			},
+		},
+		CreatedBy: strings.Repeat("e", 2<<9),
+		Comment:   strings.Repeat("f", 2<<9),
+		StartsAt:  time.Now(),
+		EndsAt:    time.Now().Add(5 * time.Minute),
+	}
+	id3, err := am.silences.Set(sil3)
+	require.Error(t, err)
+	// Do not check the exact size as it can change between consecutive runs
+	// due to padding.
+	require.Contains(t, err.Error(), "silence exceeded maximum size")
+	require.Equal(t, "", id3)
+}
diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go
index e9e6d9342b4..77b3ea07494 100644
--- a/pkg/alertmanager/multitenant.go
+++ b/pkg/alertmanager/multitenant.go
@@ -218,6 +218,12 @@ type Limits interface {
 	// AlertmanagerMaxConfigSize returns max size of configuration file that user is allowed to upload. If 0, there is no limit.
 	AlertmanagerMaxConfigSize(tenant string) int
 
+	// AlertmanagerMaxSilencesCount returns the max number of active and pending silences. If negative or 0, there is no limit.
+	AlertmanagerMaxSilencesCount(tenant string) int
+
+	// AlertmanagerMaxSilenceSizeBytes returns the max silence size in bytes. If negative or 0, there is no limit.
+	AlertmanagerMaxSilenceSizeBytes(tenant string) int
+
 	// AlertmanagerMaxTemplatesCount returns max number of templates that tenant can use in the configuration. 0 = no limit.
 	AlertmanagerMaxTemplatesCount(tenant string) int
 
diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go
index 8f625eb656c..a8a3005bff8 100644
--- a/pkg/alertmanager/multitenant_test.go
+++ b/pkg/alertmanager/multitenant_test.go
@@ -106,6 +106,9 @@ func setupSingleMultitenantAlertmanager(t *testing.T, cfg *MultitenantAlertmanagerConfig,
 	amCfg, err := ComputeFallbackConfig("")
 	require.NoError(t, err)
 
+	if limits == nil {
+		limits = &mockAlertManagerLimits{}
+	}
 	am, err := createMultitenantAlertmanager(cfg, amCfg, store, ringStore, limits, features, logger, registerer)
 	require.NoError(t, err)
 
@@ -798,7 +801,7 @@ func TestMultitenantAlertmanager_zoneAwareSharding(t *testing.T) {
 			cfg.ShardingRing.ZoneAwarenessEnabled = true
 			cfg.ShardingRing.InstanceZone = zone
 
-			am, err := createMultitenantAlertmanager(cfg, nil, alertStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewLogfmtLogger(os.Stdout), reg)
+			am, err := createMultitenantAlertmanager(cfg, nil, alertStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewLogfmtLogger(os.Stdout), reg)
 			require.NoError(t, err)
 			t.Cleanup(func() {
 				require.NoError(t, services.StopAndAwaitTerminated(ctx, am))
@@ -870,7 +873,7 @@ func TestMultitenantAlertmanager_deleteUnusedRemoteUserState(t *testing.T) {
 	// Increase state write interval so that state gets written sooner, making test faster.
 	cfg.Persister.Interval = 500 * time.Millisecond
 
-	am, err := createMultitenantAlertmanager(cfg, nil, alertStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewLogfmtLogger(os.Stdout), reg)
+	am, err := createMultitenantAlertmanager(cfg, nil, alertStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewLogfmtLogger(os.Stdout), reg)
 	require.NoError(t, err)
 	t.Cleanup(func() {
 		require.NoError(t, services.StopAndAwaitTerminated(ctx, am))
@@ -966,7 +969,7 @@ func TestMultitenantAlertmanager_deleteUnusedRemoteUserStateDisabled(t *testing.T) {
 	// Disable state cleanup.
 	cfg.EnableStateCleanup = false
 
-	am, err := createMultitenantAlertmanager(cfg, nil, alertStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewLogfmtLogger(os.Stdout), reg)
+	am, err := createMultitenantAlertmanager(cfg, nil, alertStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewLogfmtLogger(os.Stdout), reg)
 	require.NoError(t, err)
 	t.Cleanup(func() {
 		require.NoError(t, services.StopAndAwaitTerminated(ctx, am))
@@ -1337,7 +1340,7 @@ func TestMultitenantAlertmanager_InitialSync(t *testing.T) {
 		}))
 	}
 
-	am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), nil)
+	am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), nil)
 	require.NoError(t, err)
 	defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
 
@@ -1442,7 +1445,7 @@ func TestMultitenantAlertmanager_PerTenantSharding(t *testing.T) {
 			amConfig.ShardingRing.RingCheckPeriod = time.Hour
 
 			reg := prometheus.NewPedanticRegistry()
-			am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
+			am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
 			require.NoError(t, err)
 			defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
 
@@ -1597,7 +1600,7 @@ func TestMultitenantAlertmanager_SyncOnRingTopologyChanges(t *testing.T) {
 			alertStore := prepareInMemoryAlertStore()
 
 			reg := prometheus.NewPedanticRegistry()
-			am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
+			am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
 			require.NoError(t, err)
 
 			require.NoError(t, ringStore.CAS(ctx, RingKey, func(in interface{}) (interface{}, bool, error) {
@@ -1648,7 +1651,7 @@ func TestMultitenantAlertmanager_RingLifecyclerShouldAutoForgetUnhealthyInstances(t *testing.T) {
 	alertStore := prepareInMemoryAlertStore()
 
-	am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), nil)
+	am, err := createMultitenantAlertmanager(amConfig, nil, alertStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), nil)
 	require.NoError(t, err)
 	require.NoError(t, services.StartAndAwaitRunning(ctx, am))
 	defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
@@ -1685,7 +1688,7 @@ func TestMultitenantAlertmanager_InitialSyncFailure(t *testing.T) {
 	bkt.MockIter("alertmanager/", nil, nil)
 	store := bucketclient.NewBucketAlertStore(bucketclient.BucketAlertStoreConfig{}, bkt, nil, log.NewNopLogger())
 
-	am, err := createMultitenantAlertmanager(amConfig, nil, store, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), nil)
+	am, err := createMultitenantAlertmanager(amConfig, nil, store, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), nil)
 	require.NoError(t, err)
 	defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
 
@@ -1728,7 +1731,7 @@ func TestAlertmanager_ReplicasPosition(t *testing.T) {
 	amConfig.ShardingRing.RingCheckPeriod = time.Hour
 
 	reg := prometheus.NewPedanticRegistry()
-	am, err := createMultitenantAlertmanager(amConfig, nil, mockStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
+	am, err := createMultitenantAlertmanager(amConfig, nil, mockStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
 	require.NoError(t, err)
 	defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
 
@@ -1834,7 +1837,7 @@ func TestAlertmanager_StateReplication(t *testing.T) {
 			amConfig.ShardingRing.RingCheckPeriod = time.Hour
 
 			reg := prometheus.NewPedanticRegistry()
-			am, err := createMultitenantAlertmanager(amConfig, nil, mockStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
+			am, err := createMultitenantAlertmanager(amConfig, nil, mockStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
 			require.NoError(t, err)
 			defer services.StopAndAwaitTerminated(ctx, am) //nolint:errcheck
 
@@ -2013,7 +2016,7 @@ func TestAlertmanager_StateReplication_InitialSyncFromPeers(t *testing.T) {
 			amConfig.ShardingRing.RingCheckPeriod = time.Hour
 
 			reg := prometheus.NewPedanticRegistry()
-			am, err := createMultitenantAlertmanager(amConfig, nil, mockStore, ringStore, nil, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
+			am, err := createMultitenantAlertmanager(amConfig, nil, mockStore, ringStore, &mockAlertManagerLimits{}, featurecontrol.NoopFlags{}, log.NewNopLogger(), reg)
 			require.NoError(t, err)
 
 			clientPool.setServer(amConfig.ShardingRing.Common.InstanceAddr+":0", am)
@@ -2353,6 +2356,8 @@ type mockAlertManagerLimits struct {
 	emailNotificationRateLimit     rate.Limit
 	emailNotificationBurst         int
 	maxConfigSize                  int
+	maxSilencesCount               int
+	maxSilenceSizeBytes            int
 	maxTemplatesCount              int
 	maxSizeOfTemplate              int
 	maxDispatcherAggregationGroups int
@@ -2364,6 +2369,12 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxConfigSize(string) int {
 	return m.maxConfigSize
 }
 
+func (m *mockAlertManagerLimits) AlertmanagerMaxSilencesCount(string) int { return m.maxSilencesCount }
+
+func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(string) int {
+	return m.maxSilenceSizeBytes
+}
+
 func (m *mockAlertManagerLimits) AlertmanagerMaxTemplatesCount(string) int {
 	return m.maxTemplatesCount
 }
diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go
index 3a5d5c033ae..1fed78c4ecb 100644
--- a/pkg/util/validation/limits.go
+++ b/pkg/util/validation/limits.go
@@ -210,6 +210,8 @@ type Limits struct {
 	NotificationRateLimitPerIntegration        NotificationRateLimitMap `yaml:"alertmanager_notification_rate_limit_per_integration" json:"alertmanager_notification_rate_limit_per_integration"`
 	AlertmanagerMaxConfigSizeBytes             int                      `yaml:"alertmanager_max_config_size_bytes" json:"alertmanager_max_config_size_bytes"`
+	AlertmanagerMaxSilencesCount               int                      `yaml:"alertmanager_max_silences_count" json:"alertmanager_max_silences_count"`
+	AlertmanagerMaxSilenceSizeBytes            int                      `yaml:"alertmanager_max_silence_size_bytes" json:"alertmanager_max_silence_size_bytes"`
 	AlertmanagerMaxTemplatesCount              int                      `yaml:"alertmanager_max_templates_count" json:"alertmanager_max_templates_count"`
 	AlertmanagerMaxTemplateSizeBytes           int                      `yaml:"alertmanager_max_template_size_bytes" json:"alertmanager_max_template_size_bytes"`
 	AlertmanagerMaxDispatcherAggregationGroups int                      `yaml:"alertmanager_max_dispatcher_aggregation_groups" json:"alertmanager_max_dispatcher_aggregation_groups"`
@@ -335,6 +337,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) {
 	}
 	f.Var(&l.NotificationRateLimitPerIntegration, "alertmanager.notification-rate-limit-per-integration", "Per-integration notification rate limits. Value is a map, where each key is integration name and value is a rate-limit (float). On command line, this map is given in JSON format. Rate limit has the same meaning as -alertmanager.notification-rate-limit, but only applies for specific integration. Allowed integration names: "+strings.Join(allowedIntegrationNames, ", ")+".")
 	f.IntVar(&l.AlertmanagerMaxConfigSizeBytes, "alertmanager.max-config-size-bytes", 0, "Maximum size of configuration file for Alertmanager that tenant can upload via Alertmanager API. 0 = no limit.")
+	f.IntVar(&l.AlertmanagerMaxSilencesCount, "alertmanager.max-silences-count", 0, "Maximum number of active and pending silences that a tenant can have at once. 0 = no limit.")
+	f.IntVar(&l.AlertmanagerMaxSilenceSizeBytes, "alertmanager.max-silence-size-bytes", 0, "Maximum silence size in bytes. 0 = no limit.")
 	f.IntVar(&l.AlertmanagerMaxTemplatesCount, "alertmanager.max-templates-count", 0, "Maximum number of templates in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.")
 	f.IntVar(&l.AlertmanagerMaxTemplateSizeBytes, "alertmanager.max-template-size-bytes", 0, "Maximum size of single template in tenant's Alertmanager configuration uploaded via Alertmanager API. 0 = no limit.")
 	f.IntVar(&l.AlertmanagerMaxDispatcherAggregationGroups, "alertmanager.max-dispatcher-aggregation-groups", 0, "Maximum number of aggregation groups in Alertmanager's dispatcher that a tenant can have. Each active aggregation group uses single goroutine. When the limit is reached, dispatcher will not dispatch alerts that belong to additional aggregation groups, but existing groups will keep working properly. 0 = no limit.")
@@ -922,6 +926,14 @@ func (o *Overrides) AlertmanagerMaxConfigSize(userID string) int {
 	return o.getOverridesForUser(userID).AlertmanagerMaxConfigSizeBytes
 }
 
+func (o *Overrides) AlertmanagerMaxSilencesCount(userID string) int {
+	return o.getOverridesForUser(userID).AlertmanagerMaxSilencesCount
+}
+
+func (o *Overrides) AlertmanagerMaxSilenceSizeBytes(userID string) int {
+	return o.getOverridesForUser(userID).AlertmanagerMaxSilenceSizeBytes
+}
+
 func (o *Overrides) AlertmanagerMaxTemplatesCount(userID string) int {
 	return o.getOverridesForUser(userID).AlertmanagerMaxTemplatesCount
 }
diff --git a/vendor/github.com/hashicorp/go-hclog/README.md b/vendor/github.com/hashicorp/go-hclog/README.md
index 21a17c5af39..983d44c7db4 100644
--- a/vendor/github.com/hashicorp/go-hclog/README.md
+++ b/vendor/github.com/hashicorp/go-hclog/README.md
@@ -140,9 +140,10 @@ log.Printf("[DEBUG] %d", 42)
 ...
 [DEBUG] my-app: 42
 ```
 
-Notice that if `appLogger` is initialized with the `INFO` log level _and_ you
+Notice that if `appLogger` is initialized with the `INFO` log level, _and_ you
 specify `InferLevels: true`, you will not see any output here. You must change
 `appLogger` to `DEBUG` to see output. See the docs for more information.
 
 If the log lines start with a timestamp you can use the
-`InferLevelsWithTimestamp` option to try and ignore them.
+`InferLevelsWithTimestamp` option to try and ignore them. Please note that in order
+for `InferLevelsWithTimestamp` to be relevant, `InferLevels` must be set to `true`.
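The README clarification above is easier to see in running code. The following is a minimal, illustrative sketch (not part of the diff itself) that uses only the public go-hclog API; the logger name, timestamp, and messages are made up:

```go
package main

import "github.com/hashicorp/go-hclog"

func main() {
	// Filter at INFO, mirroring the README example.
	appLogger := hclog.New(&hclog.LoggerOptions{
		Name:  "my-app",
		Level: hclog.Info,
	})

	// InferLevelsWithTimestamp is only consulted when InferLevels is true,
	// which is exactly the point the added README sentence makes.
	std := appLogger.StandardLogger(&hclog.StandardLoggerOptions{
		InferLevels:              true,
		InferLevelsWithTimestamp: true,
	})

	// The "[DEBUG]" prefix is inferred as a level even behind the leading
	// timestamp, so this line is filtered out by the INFO threshold.
	std.Printf("2024/05/31 10:00:00 [DEBUG] %d", 42)

	// Inferred as INFO and emitted through appLogger.
	std.Printf("[INFO] started")
}
```

Because level inference happens inside the standard-logger adapter, `InferLevelsWithTimestamp` has nothing to act on unless `InferLevels` is enabled first.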
diff --git a/vendor/github.com/hashicorp/go-hclog/intlogger.go b/vendor/github.com/hashicorp/go-hclog/intlogger.go
index b45064acf1a..104d82ff1ba 100644
--- a/vendor/github.com/hashicorp/go-hclog/intlogger.go
+++ b/vendor/github.com/hashicorp/go-hclog/intlogger.go
@@ -55,11 +55,25 @@ var (
 	faintBoldColor = color.New(color.Faint, color.Bold)
 	faintColor     = color.New(color.Faint)
 
-	faintMultiLinePrefix           = faintColor.Sprint(" | ")
-	faintFieldSeparator            = faintColor.Sprint("=")
-	faintFieldSeparatorWithNewLine = faintColor.Sprint("=\n")
+	faintMultiLinePrefix           string
+	faintFieldSeparator            string
+	faintFieldSeparatorWithNewLine string
 )
 
+func init() {
+	// Force all the colors to enabled because we do our own detection of color usage.
+	for _, c := range _levelToColor {
+		c.EnableColor()
+	}
+
+	faintBoldColor.EnableColor()
+	faintColor.EnableColor()
+
+	faintMultiLinePrefix = faintColor.Sprint(" | ")
+	faintFieldSeparator = faintColor.Sprint("=")
+	faintFieldSeparatorWithNewLine = faintColor.Sprint("=\n")
+}
+
 // Make sure that intLogger is a Logger
 var _ Logger = &intLogger{}
 
@@ -79,6 +93,19 @@ type intLogger struct {
 	writer *writer
 	level  *int32
 
+	// The value of curEpoch when our level was set
+	setEpoch uint64
+
+	// The value of curEpoch the last time we performed the level sync process
+	ownEpoch uint64
+
+	// Shared amongst all the loggers created in this hierarchy, used to determine
+	// if the level sync process should be run by comparing it with ownEpoch
+	curEpoch *uint64
+
+	// The logger this one was created from. Only set when syncParentLevel is set
+	parent *intLogger
+
 	headerColor ColorOption
 	fieldColor  ColorOption
 
@@ -88,6 +115,7 @@ type intLogger struct {
 	// create subloggers with their own level setting
 	independentLevels bool
+	syncParentLevel   bool
 
 	subloggerHook func(sub Logger) Logger
 }
@@ -129,9 +157,9 @@ func newLogger(opts *LoggerOptions) *intLogger {
 	}
 
 	var (
-		primaryColor ColorOption = ColorOff
-		headerColor  ColorOption = ColorOff
-		fieldColor   ColorOption = ColorOff
+		primaryColor = ColorOff
+		headerColor  = ColorOff
+		fieldColor   = ColorOff
 	)
 	switch {
 	case opts.ColorHeaderOnly:
@@ -152,8 +180,10 @@ func newLogger(opts *LoggerOptions) *intLogger {
 		mutex:             mutex,
 		writer:            newWriter(output, primaryColor),
 		level:             new(int32),
+		curEpoch:          new(uint64),
 		exclude:           opts.Exclude,
 		independentLevels: opts.IndependentLevels,
+		syncParentLevel:   opts.SyncParentLevel,
 		headerColor:       headerColor,
 		fieldColor:        fieldColor,
 		subloggerHook:     opts.SubloggerHook,
@@ -194,7 +224,7 @@ const offsetIntLogger = 3
 // Log a message and a set of key/value pairs if the given level is at
 // or more severe than the threshold configured in the Logger.
 func (l *intLogger) log(name string, level Level, msg string, args ...interface{}) {
-	if level < Level(atomic.LoadInt32(l.level)) {
+	if level < l.GetLevel() {
 		return
 	}
@@ -597,7 +627,7 @@ func (l *intLogger) logJSON(t time.Time, name string, level Level, msg string, args ...interface{}) {
 	vals := l.jsonMapEntry(t, name, level, msg)
 	args = append(l.implied, args...)
-	if args != nil && len(args) > 0 {
+	if len(args) > 0 {
 		if len(args)%2 != 0 {
 			cs, ok := args[len(args)-1].(CapturedStacktrace)
 			if ok {
@@ -718,27 +748,27 @@ func (l *intLogger) Error(msg string, args ...interface{}) {
 
 // Indicate that the logger would emit TRACE level logs
 func (l *intLogger) IsTrace() bool {
-	return Level(atomic.LoadInt32(l.level)) == Trace
+	return l.GetLevel() == Trace
 }
 
 // Indicate that the logger would emit DEBUG level logs
 func (l *intLogger) IsDebug() bool {
-	return Level(atomic.LoadInt32(l.level)) <= Debug
+	return l.GetLevel() <= Debug
 }
 
 // Indicate that the logger would emit INFO level logs
 func (l *intLogger) IsInfo() bool {
-	return Level(atomic.LoadInt32(l.level)) <= Info
+	return l.GetLevel() <= Info
 }
 
 // Indicate that the logger would emit WARN level logs
 func (l *intLogger) IsWarn() bool {
-	return Level(atomic.LoadInt32(l.level)) <= Warn
+	return l.GetLevel() <= Warn
 }
 
 // Indicate that the logger would emit ERROR level logs
 func (l *intLogger) IsError() bool {
-	return Level(atomic.LoadInt32(l.level)) <= Error
+	return l.GetLevel() <= Error
 }
 
 const MissingKey = "EXTRA_VALUE_AT_END"
 
@@ -854,12 +884,63 @@ func (l *intLogger) resetOutput(opts *LoggerOptions) error {
 // Update the logging level on-the-fly. This will affect all subloggers as
 // well.
 func (l *intLogger) SetLevel(level Level) {
-	atomic.StoreInt32(l.level, int32(level))
+	if !l.syncParentLevel {
+		atomic.StoreInt32(l.level, int32(level))
+		return
+	}
+
+	nsl := new(int32)
+	*nsl = int32(level)
+
+	l.level = nsl
+
+	l.ownEpoch = atomic.AddUint64(l.curEpoch, 1)
+	l.setEpoch = l.ownEpoch
+}
+
+func (l *intLogger) searchLevelPtr() *int32 {
+	p := l.parent
+
+	ptr := l.level
+
+	max := l.setEpoch
+
+	for p != nil {
+		if p.setEpoch > max {
+			max = p.setEpoch
+			ptr = p.level
+		}
+
+		p = p.parent
+	}
+
+	return ptr
 }
 
 // Returns the current level
 func (l *intLogger) GetLevel() Level {
-	return Level(atomic.LoadInt32(l.level))
+	// We perform the loads immediately to keep the CPU pipeline busy, which
+	// effectively makes the second load cost nothing. Once loaded into registers
+	// the comparison returns the already loaded value. The comparison is almost
+	// always true, so the branch predictor should hit consistently with it.
+	var (
+		curEpoch = atomic.LoadUint64(l.curEpoch)
+		level    = Level(atomic.LoadInt32(l.level))
+		own      = l.ownEpoch
+	)
+
+	if curEpoch == own {
+		return level
+	}
+
+	// Perform the level sync process. We'll avoid doing this next time by seeing the
+	// epoch as current.
+
+	ptr := l.searchLevelPtr()
+	l.level = ptr
+	l.ownEpoch = curEpoch
+
+	return Level(atomic.LoadInt32(ptr))
 }
 
 // Create a *log.Logger that will send its data through this Logger. This
@@ -912,6 +993,8 @@ func (l *intLogger) copy() *intLogger {
 	if l.independentLevels {
 		sl.level = new(int32)
 		*sl.level = *l.level
+	} else if l.syncParentLevel {
+		sl.parent = l
 	}
 
 	return &sl
diff --git a/vendor/github.com/hashicorp/go-hclog/logger.go b/vendor/github.com/hashicorp/go-hclog/logger.go
index 947ac0c9afc..d7806fb5e8b 100644
--- a/vendor/github.com/hashicorp/go-hclog/logger.go
+++ b/vendor/github.com/hashicorp/go-hclog/logger.go
@@ -233,6 +233,7 @@ type StandardLoggerOptions struct {
 	// [DEBUG] and strip it off before reapplying it.
 	// The timestamp detection may result in false positives and incomplete
 	// string outputs.
+	// InferLevelsWithTimestamp is only relevant if InferLevels is true.
 	InferLevelsWithTimestamp bool
 
 	// ForceLevel is used to force all output from the standard logger to be at
@@ -303,6 +304,24 @@ type LoggerOptions struct {
 	// will not affect the parent or sibling loggers.
 	IndependentLevels bool
 
+	// When set, changing the level of a logger affects only its direct sub-loggers
+	// rather than all sub-loggers. For example:
+	// a := logger.Named("a")
+	// a.SetLevel(Error)
+	// b := a.Named("b")
+	// c := a.Named("c")
+	// b.GetLevel() => Error
+	// c.GetLevel() => Error
+	// b.SetLevel(Info)
+	// a.GetLevel() => Error
+	// b.GetLevel() => Info
+	// c.GetLevel() => Error
+	// a.SetLevel(Warn)
+	// a.GetLevel() => Warn
+	// b.GetLevel() => Warn
+	// c.GetLevel() => Warn
+	SyncParentLevel bool
+
 	// SubloggerHook registers a function that is called when a sublogger via
 	// Named, With, or ResetNamed is created. If defined, the function is passed
 	// the newly created Logger and the returned Logger is returned from the
diff --git a/vendor/github.com/prometheus/alertmanager/silence/silence.go b/vendor/github.com/prometheus/alertmanager/silence/silence.go
index 84507bc5be2..ec063edd540 100644
--- a/vendor/github.com/prometheus/alertmanager/silence/silence.go
+++ b/vendor/github.com/prometheus/alertmanager/silence/silence.go
@@ -193,6 +193,7 @@ type Silences struct {
 	logger    log.Logger
 	metrics   *metrics
 	retention time.Duration
+	limits    Limits
 
 	mtx sync.RWMutex
 	st  state
@@ -201,6 +202,16 @@ type Silences struct {
 	mc matcherCache
 }
 
+// Limits contains the limits for silences.
+type Limits struct {
+	// MaxSilences limits the maximum number of active and pending silences.
+	// It does not include expired silences.
+	MaxSilences int
+	// MaxPerSilenceBytes is the maximum size of an individual silence as
+	// stored on disk.
+	MaxPerSilenceBytes int
+}
+
 // MaintenanceFunc represents the function to run as part of the periodic maintenance for silences.
 // It returns the size of the snapshot taken or an error if it failed.
 type MaintenanceFunc func() (int64, error)
@@ -318,6 +329,7 @@ type Options struct {
 	// Retention time for newly created Silences. Silences may be
 	// garbage collected after the given duration after they ended.
 	Retention time.Duration
+	Limits    Limits
 
 	// A logger used by background processing.
 	Logger log.Logger
@@ -342,6 +354,7 @@ func New(o Options) (*Silences, error) {
 		mc:        matcherCache{},
 		logger:    log.NewNopLogger(),
 		retention: o.Retention,
+		limits:    o.Limits,
 		broadcast: func([]byte) {},
 		st:        state{},
 	}
@@ -569,6 +582,13 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool) error {
 		return err
 	}
 
+	// Check the limit unless the silence has been expired. This is to avoid
+	// situations where silences cannot be expired after the limit has been
+	// reduced.
+	if n := msil.Size(); s.limits.MaxPerSilenceBytes > 0 && n > s.limits.MaxPerSilenceBytes && sil.EndsAt.After(now) {
+		return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, s.limits.MaxPerSilenceBytes)
+	}
+
 	if s.st.merge(msil, now) {
 		s.version++
 	}
@@ -608,10 +628,10 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
 func (s *Silences) set(sil *pb.Silence) (string, error) {
 	now := s.nowUTC()
 	prev, ok := s.getSilence(sil.Id)
-
 	if sil.Id != "" && !ok {
 		return "", ErrNotFound
 	}
+
 	if ok {
 		if canUpdate(prev, sil, now) {
 			return sil.Id, s.setSilence(sil, now, false)
@@ -623,7 +643,24 @@ func (s *Silences) set(sil *pb.Silence) (string, error) {
 			}
 		}
 	}
+
+	// If we got here it's either a new silence or a replacing one.
+	if s.limits.MaxSilences > 0 {
+		// Get the number of active and pending silences to enforce limits.
+		q := &query{}
+		err := QState(types.SilenceStateActive, types.SilenceStatePending)(q)
+		if err != nil {
+			return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
+		}
+		sils, _, err := s.query(q, s.nowUTC())
+		if err != nil {
+			return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
+		}
+		if len(sils)+1 > s.limits.MaxSilences {
+			return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(sils), s.limits.MaxSilences)
+		}
+	}
+
 	uid, err := uuid.NewV4()
 	if err != nil {
 		return "", fmt.Errorf("generate uuid: %w", err)
@@ -634,7 +671,11 @@ func (s *Silences) set(sil *pb.Silence) (string, error) {
 		sil.StartsAt = now
 	}
 
-	return sil.Id, s.setSilence(sil, now, false)
+	if err = s.setSilence(sil, now, false); err != nil {
+		return "", err
+	}
+
+	return sil.Id, nil
 }
 
 // canUpdate returns true if silence a can be updated to b without
@@ -778,6 +819,9 @@ func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) {
 // Query for silences based on the given query parameters. It returns the
 // resulting silences and the state version the result is based on.
 func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) {
+	s.mtx.Lock()
+	defer s.mtx.Unlock()
+
 	s.metrics.queriesTotal.Inc()
 	defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration()
 
@@ -817,9 +861,6 @@ func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, int, error) {
 	// the use of post-filter functions is the trivial solution for now.
 	var res []*pb.Silence
 
-	s.mtx.Lock()
-	defer s.mtx.Unlock()
-
 	if q.ids != nil {
 		for _, id := range q.ids {
 			if s, ok := s.st[id]; ok {
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 351a7a6a171..9b3f0fbaee9 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -646,7 +646,7 @@ github.com/hashicorp/errwrap
 # github.com/hashicorp/go-cleanhttp v0.5.2
 ## explicit; go 1.13
 github.com/hashicorp/go-cleanhttp
-# github.com/hashicorp/go-hclog v1.5.0
+# github.com/hashicorp/go-hclog v1.6.2
 ## explicit; go 1.13
 github.com/hashicorp/go-hclog
 # github.com/hashicorp/go-immutable-radix v1.3.1
@@ -852,7 +852,7 @@ github.com/pkg/errors
 # github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2
 ## explicit
 github.com/pmezard/go-difflib/difflib
-# github.com/prometheus/alertmanager v0.27.0 => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240524091923-8090d8837b5f
+# github.com/prometheus/alertmanager v0.27.0 => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240531172444-6ad94e405c5a
 ## explicit; go 1.21
 github.com/prometheus/alertmanager/api
 github.com/prometheus/alertmanager/api/metrics
@@ -1557,4 +1557,4 @@ sigs.k8s.io/yaml/goyaml.v3
 # github.com/munnerz/goautoneg => github.com/grafana/goautoneg v0.0.0-20231010094147-47ce5e72a9ae
 # github.com/opentracing-contrib/go-stdlib => github.com/grafana/opentracing-contrib-go-stdlib v0.0.0-20230509071955-f410e79da956
 # github.com/opentracing-contrib/go-grpc => github.com/charleskorn/go-grpc v0.0.0-20231024023642-e9298576254f
-# github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240524091923-8090d8837b5f
+# github.com/prometheus/alertmanager => github.com/grafana/prometheus-alertmanager v0.25.1-0.20240531172444-6ad94e405c5a
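Taken together, the change plumbs the two new per-tenant limits from Mimir's `Limits`/`Overrides` into the forked Alertmanager silence store. A sketch of how an operator might enable them through the runtime overrides file follows; the tenant name and values are illustrative, and both limits stay disabled (0) unless explicitly set:

```yaml
# Runtime overrides file (e.g. the one passed via -runtime-config.file).
overrides:
  tenant-a:
    # Allow at most 500 active or pending silences; expired silences are not counted.
    alertmanager_max_silences_count: 500
    # Reject any single silence larger than 4KiB as stored on disk.
    alertmanager_max_silence_size_bytes: 4096
```

The same values can also be applied globally to all tenants through the `-alertmanager.max-silences-count` and `-alertmanager.max-silence-size-bytes` flags registered in `pkg/util/validation/limits.go`.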