diff --git a/CHANGELOG.md b/CHANGELOG.md index 60e7b15ea1..36e54e2c7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## master / unreleased +* [CHANGE] Alertmanager now removes local files after Alertmanager is no longer running for a removed or resharded user. #3910 +* [CHANGE] Alertmanager now stores local files in per-tenant folders. Files previously stored by Alertmanager are migrated to the new hierarchy. Support for this migration will be removed in Cortex 1.10. #3910 * [ENHANCEMENT] Ruler: optimized `/api/v1/rules` and `/api/v1/alerts` when ruler sharding is enabled. #3916 * [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916 * `cortex_ruler_clients` diff --git a/development/tsdb-blocks-storage-s3/config/rules.yaml b/development/tsdb-blocks-storage-s3/config/rules.yaml index acffa8ada9..b65a4590da 100644 --- a/development/tsdb-blocks-storage-s3/config/rules.yaml +++ b/development/tsdb-blocks-storage-s3/config/rules.yaml @@ -4,3 +4,13 @@ groups: rules: - record: up:count expr: count(up) + + - name: example2 + rules: + - alert: TooManyServices + expr: count(up) > 1 + for: 1m + labels: + severity: page + annotations: + summary: Too many services diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index b0eaf0b81a..4e84db0d9e 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -43,19 +43,28 @@ import ( "github.com/prometheus/common/route" ) -const notificationLogMaintenancePeriod = 15 * time.Minute +const ( + // maintenancePeriod is used for periodic storing of silences and notifications to local files. + maintenancePeriod = 15 * time.Minute + + // Filenames used within tenant-directory + notificationLogSnapshot = "notifications" + silencesSnapshot = "silences" + templatesDir = "templates" +) // Config configures an Alertmanager. type Config struct { - UserID string - // Used to persist notification logs and silences on disk. 
- DataDir string + UserID string Logger log.Logger Peer *cluster.Peer PeerTimeout time.Duration Retention time.Duration ExternalURL *url.URL + // Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, entire dir is removed. + TenantDataDir string + ShardingEnabled bool ReplicationFactor int ReplicateStateFunc func(context.Context, string, *clusterpb.Part) error @@ -115,6 +124,10 @@ type State interface { // New creates a new Alertmanager. func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { + if cfg.TenantDataDir == "" { + return nil, fmt.Errorf("directory for tenant-specific AlertManager is not configured") + } + am := &Alertmanager{ cfg: cfg, logger: log.With(cfg.Logger, "user", cfg.UserID), @@ -144,12 +157,11 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { } am.wg.Add(1) - nflogID := fmt.Sprintf("nflog:%s", cfg.UserID) var err error am.nflog, err = nflog.New( nflog.WithRetention(cfg.Retention), - nflog.WithSnapshot(filepath.Join(cfg.DataDir, nflogID)), - nflog.WithMaintenance(notificationLogMaintenancePeriod, am.stop, am.wg.Done), + nflog.WithSnapshot(filepath.Join(cfg.TenantDataDir, notificationLogSnapshot)), + nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done), nflog.WithMetrics(am.registry), nflog.WithLogger(log.With(am.logger, "component", "nflog")), ) @@ -162,9 +174,9 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { am.marker = types.NewMarker(am.registry) - silencesID := fmt.Sprintf("silences:%s", cfg.UserID) + silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot) am.silences, err = silence.New(silence.Options{ - SnapshotFile: filepath.Join(cfg.DataDir, silencesID), + SnapshotFile: silencesFile, Retention: cfg.Retention, Logger: log.With(am.logger, "component", "silences"), Metrics: am.registry, @@ -180,7 +192,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) { 
am.wg.Add(1) go func() { - am.silences.Maintenance(15*time.Minute, filepath.Join(cfg.DataDir, silencesID), am.stop) + am.silences.Maintenance(maintenancePeriod, silencesFile, am.stop) am.wg.Done() }() @@ -240,7 +252,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s templateFiles := make([]string, len(conf.Templates)) if len(conf.Templates) > 0 { for i, t := range conf.Templates { - templateFiles[i] = filepath.Join(am.cfg.DataDir, "templates", userID, t) + templateFiles[i] = filepath.Join(am.cfg.TenantDataDir, templatesDir, t) } } diff --git a/pkg/alertmanager/api.go b/pkg/alertmanager/api.go index 95b2a9d2e7..74c8dbaf85 100644 --- a/pkg/alertmanager/api.go +++ b/pkg/alertmanager/api.go @@ -153,14 +153,14 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error { // not to configured data dir, and on the flipside, it'll fail if we can't write // to tmpDir. Ignoring both cases for now as they're ultra rare but will revisit if // we see this in the wild. - tmpDir, err := ioutil.TempDir("", "validate-config") + userTempDir, err := ioutil.TempDir("", "validate-config-"+cfg.User) if err != nil { return err } - defer os.RemoveAll(tmpDir) + defer os.RemoveAll(userTempDir) for _, tmpl := range cfg.Templates { - _, err := createTemplateFile(tmpDir, cfg.User, tmpl.Filename, tmpl.Body) + _, err := storeTemplateFile(userTempDir, tmpl.Filename, tmpl.Body) if err != nil { level.Error(logger).Log("msg", "unable to create template file", "err", err, "user", cfg.User) return fmt.Errorf("unable to create template file '%s'", tmpl.Filename) @@ -169,7 +169,7 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error { templateFiles := make([]string, len(amCfg.Templates)) for i, t := range amCfg.Templates { - templateFiles[i] = filepath.Join(tmpDir, "templates", cfg.User, t) + templateFiles[i] = filepath.Join(userTempDir, t) } _, err = template.FromGlobs(templateFiles...) 
diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 23f7ef2dcc..e7f4b63365 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -10,6 +10,7 @@ import ( "net/url" "os" "path/filepath" + "strings" "sync" "time" @@ -21,6 +22,7 @@ import ( amconfig "github.com/prometheus/alertmanager/config" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + tsdb_errors "github.com/prometheus/prometheus/tsdb/errors" "github.com/weaveworks/common/httpgrpc" "github.com/weaveworks/common/httpgrpc/server" "github.com/weaveworks/common/user" @@ -447,6 +449,11 @@ func (h *handlerForGRPCServer) ServeHTTP(w http.ResponseWriter, req *http.Reques } func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { + err = am.migrateStateFilesToPerTenantDirectories() + if err != nil { + return err + } + defer func() { if err == nil || am.subservices == nil { return @@ -500,6 +507,119 @@ func (am *MultitenantAlertmanager) starting(ctx context.Context) (err error) { return nil } +// migrateStateFilesToPerTenantDirectories migrates any existing configuration from old place to new hierarchy. +// TODO: Remove in Cortex 1.10. 
+func (am *MultitenantAlertmanager) migrateStateFilesToPerTenantDirectories() error { + migrate := func(from, to string) error { + level.Info(am.logger).Log("msg", "migrating alertmanager state", "from", from, "to", to) + err := os.Rename(from, to) + return errors.Wrapf(err, "failed to migrate alertmanager state from %v to %v", from, to) + } + + st, err := am.getObsoleteFilesPerUser() + if err != nil { + return errors.Wrap(err, "failed to migrate alertmanager state files") + } + + for userID, files := range st { + tenantDir := am.getTenantDirectory(userID) + err := os.MkdirAll(tenantDir, 0777) + if err != nil { + return errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) + } + + errs := tsdb_errors.NewMulti() + + if files.notificationLogSnapshot != "" { + errs.Add(migrate(files.notificationLogSnapshot, filepath.Join(tenantDir, notificationLogSnapshot))) + } + + if files.silencesSnapshot != "" { + errs.Add(migrate(files.silencesSnapshot, filepath.Join(tenantDir, silencesSnapshot))) + } + + if files.templatesDir != "" { + errs.Add(migrate(files.templatesDir, filepath.Join(tenantDir, templatesDir))) + } + + if err := errs.Err(); err != nil { + return err + } + } + return nil +} + +type obsoleteStateFiles struct { + notificationLogSnapshot string + silencesSnapshot string + templatesDir string +} + +// getObsoleteFilesPerUser returns per-user set of files that should be migrated from old structure to new structure. 
+func (am *MultitenantAlertmanager) getObsoleteFilesPerUser() (map[string]obsoleteStateFiles, error) { + files, err := ioutil.ReadDir(am.cfg.DataDir) + if err != nil { + return nil, errors.Wrapf(err, "failed to list dir %v", am.cfg.DataDir) + } + + // old names + const ( + notificationLogPrefix = "nflog:" + silencesPrefix = "silences:" + templates = "templates" + ) + + result := map[string]obsoleteStateFiles{} + + for _, f := range files { + fullPath := filepath.Join(am.cfg.DataDir, f.Name()) + + if f.IsDir() { + // Process templates dir. + if f.Name() != templates { + // Ignore other files -- those are likely per tenant directories. + continue + } + + templateDirs, err := ioutil.ReadDir(fullPath) + if err != nil { + return nil, errors.Wrapf(err, "failed to list dir %v", fullPath) + } + + // Previously templates directory contained per-tenant subdirectory. + for _, d := range templateDirs { + if d.IsDir() { + v := result[d.Name()] + v.templatesDir = filepath.Join(fullPath, d.Name()) + result[d.Name()] = v + } else { + level.Warn(am.logger).Log("msg", "ignoring unknown local file while migrating local alertmanager state files", "file", filepath.Join(fullPath, d.Name())) + } + } + continue + } + + switch { + case strings.HasPrefix(f.Name(), notificationLogPrefix): + userID := strings.TrimPrefix(f.Name(), notificationLogPrefix) + v := result[userID] + v.notificationLogSnapshot = fullPath + result[userID] = v + + case strings.HasPrefix(f.Name(), silencesPrefix): + userID := strings.TrimPrefix(f.Name(), silencesPrefix) + v := result[userID] + v.silencesSnapshot = fullPath + result[userID] = v + + default: + level.Warn(am.logger).Log("msg", "ignoring unknown local data file while migrating local alertmanager state files", "file", fullPath) + } + } + + return result, nil +} + func (am *MultitenantAlertmanager) run(ctx context.Context) error { tick := time.NewTicker(am.cfg.PollInterval) defer tick.Stop() @@ -551,6 +671,8 @@ func (am *MultitenantAlertmanager) 
loadAndSyncConfigs(ctx context.Context, syncR } am.syncConfigs(cfgs) + am.deleteUnusedLocalUserState() + return nil } @@ -636,20 +758,27 @@ func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alertspb.AlertCon am.multitenantMetrics.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() } + userAlertmanagersToStop := map[string]*Alertmanager{} + am.alertmanagersMtx.Lock() - defer am.alertmanagersMtx.Unlock() for userID, userAM := range am.alertmanagers { if _, exists := cfgs[userID]; !exists { - level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", userID) - userAM.Stop() + userAlertmanagersToStop[userID] = userAM delete(am.alertmanagers, userID) delete(am.cfgs, userID) am.multitenantMetrics.lastReloadSuccessful.DeleteLabelValues(userID) am.multitenantMetrics.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) am.alertmanagerMetrics.removeUserRegistry(userID) - level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", userID) } } + am.alertmanagersMtx.Unlock() + + // Now stop alertmanagers and wait until they are really stopped, without holding lock. 
+ for userID, userAM := range userAlertmanagersToStop { + level.Info(am.logger).Log("msg", "deactivating per-tenant alertmanager", "user", userID) + userAM.StopAndWait() + level.Info(am.logger).Log("msg", "deactivated per-tenant alertmanager", "user", userID) + } } // setConfig applies the given configuration to the alertmanager for `userID`, @@ -660,7 +789,7 @@ func (am *MultitenantAlertmanager) setConfig(cfg alertspb.AlertConfigDesc) error var hasTemplateChanges bool for _, tmpl := range cfg.Templates { - hasChanged, err := createTemplateFile(am.cfg.DataDir, cfg.User, tmpl.Filename, tmpl.Body) + hasChanged, err := storeTemplateFile(filepath.Join(am.getTenantDirectory(cfg.User), templatesDir), tmpl.Filename, tmpl.Body) if err != nil { return err } @@ -742,12 +871,22 @@ func (am *MultitenantAlertmanager) setConfig(cfg alertspb.AlertConfigDesc) error return nil } +func (am *MultitenantAlertmanager) getTenantDirectory(userID string) string { + return filepath.Join(am.cfg.DataDir, userID) +} + func (am *MultitenantAlertmanager) newAlertmanager(userID string, amConfig *amconfig.Config, rawCfg string) (*Alertmanager, error) { reg := prometheus.NewRegistry() + tenantDir := am.getTenantDirectory(userID) + err := os.MkdirAll(tenantDir, 0777) + if err != nil { + return nil, errors.Wrapf(err, "failed to create per-tenant directory %v", tenantDir) + } + newAM, err := New(&Config{ UserID: userID, - DataDir: am.cfg.DataDir, + TenantDataDir: tenantDir, Logger: util_log.Logger, Peer: am.peer, PeerTimeout: am.cfg.Cluster.PeerTimeout, @@ -944,6 +1083,54 @@ func (am *MultitenantAlertmanager) UpdateState(ctx context.Context, part *cluste return &alertmanagerpb.UpdateStateResponse{Status: alertmanagerpb.OK}, nil } +// deleteUnusedLocalUserState deletes local files for users that we no longer need. +func (am *MultitenantAlertmanager) deleteUnusedLocalUserState() { + userDirs := am.getPerUserDirectories() + + // And delete remaining files. 
+ for userID, dir := range userDirs { + am.alertmanagersMtx.Lock() + userAM := am.alertmanagers[userID] + am.alertmanagersMtx.Unlock() + + // Don't delete directory if AM for user still exists. + if userAM != nil { + continue + } + + err := os.RemoveAll(dir) + if err != nil { + level.Warn(am.logger).Log("msg", "failed to delete directory for user", "dir", dir, "user", userID, "err", err) + } else { + level.Info(am.logger).Log("msg", "deleted local directory for user", "dir", dir, "user", userID) + } + } +} + +// getPerUserDirectories returns map of users to their directories (full path). Only users with local +// directory are returned. +func (am *MultitenantAlertmanager) getPerUserDirectories() map[string]string { + files, err := ioutil.ReadDir(am.cfg.DataDir) + if err != nil { + level.Warn(am.logger).Log("msg", "failed to list local dir", "dir", am.cfg.DataDir, "err", err) + return nil + } + + result := map[string]string{} + + for _, f := range files { + fullPath := filepath.Join(am.cfg.DataDir, f.Name()) + + if !f.IsDir() { + level.Warn(am.logger).Log("msg", "ignoring unexpected file while scanning local alertmanager configs", "file", fullPath) + continue + } + + result[f.Name()] = fullPath + } + return result +} + // StatusHandler shows the status of the alertmanager. type StatusHandler struct { am *MultitenantAlertmanager @@ -957,21 +1144,26 @@ func (s StatusHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) { } } -func createTemplateFile(dataDir, userID, fn, content string) (bool, error) { - if fn != filepath.Base(fn) { - return false, fmt.Errorf("template file name '%s' is not not valid", fn) +// storeTemplateFile stores template file with given content into specific directory. +// Since templateFileName is provided by end-user, it is verified that it doesn't do any path-traversal. +// Returns true, if file content has changed (new or updated file), false if file with the same name +// and content was already stored locally. 
+func storeTemplateFile(dir, templateFileName, content string) (bool, error) { + if templateFileName != filepath.Base(templateFileName) { + return false, fmt.Errorf("template file name '%s' is not not valid", templateFileName) } - dir := filepath.Join(dataDir, "templates", userID, filepath.Dir(fn)) err := os.MkdirAll(dir, 0755) if err != nil { return false, fmt.Errorf("unable to create Alertmanager templates directory %q: %s", dir, err) } - file := filepath.Join(dir, fn) + file := filepath.Join(dir, templateFileName) // Check if the template file already exists and if it has changed if tmpl, err := ioutil.ReadFile(file); err == nil && string(tmpl) == content { return false, nil + } else if err != nil && !os.IsNotExist(err) { + return false, err } if err := ioutil.WriteFile(file, []byte(content), 0644); err != nil { diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index d8b46a4dd2..6f295f294b 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -12,6 +12,7 @@ import ( "net/http/httptest" "net/http/pprof" "os" + "path/filepath" "regexp" "strings" "testing" @@ -106,8 +107,8 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { require.NoError(t, err) require.Len(t, am.alertmanagers, 2) - currentConfig, exists := am.cfgs["user1"] - require.True(t, exists) + currentConfig, cfgExists := am.cfgs["user1"] + require.True(t, cfgExists) require.Equal(t, simpleConfigOne, currentConfig.RawConfig) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` @@ -118,16 +119,38 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is added, it is synced correctly - require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ - User: "user3", - RawConfig: simpleConfigOne, - Templates: []*alertspb.TemplateDesc{}, - })) + user3Cfg := alertspb.AlertConfigDesc{ + 
User: "user3", + RawConfig: simpleConfigOne + ` +templates: +- 'first.tpl' +- 'second.tpl' +`, + Templates: []*alertspb.TemplateDesc{ + { + Filename: "first.tpl", + Body: `{{ define "t1" }}Template 1 ... {{end}}`, + }, + { + Filename: "second.tpl", + Body: `{{ define "t2" }}Template 2{{ end}}`, + }, + }, + } + require.NoError(t, store.SetAlertConfig(ctx, user3Cfg)) err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) require.Len(t, am.alertmanagers, 3) + dirs := am.getPerUserDirectories() + user3Dir := dirs["user3"] + require.NotZero(t, user3Dir) + require.True(t, dirExists(t, user3Dir)) + require.True(t, dirExists(t, filepath.Join(user3Dir, templatesDir))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"))) + assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. # TYPE cortex_alertmanager_config_last_reload_successful gauge @@ -146,20 +169,25 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) - currentConfig, exists = am.cfgs["user1"] - require.True(t, exists) + currentConfig, cfgExists = am.cfgs["user1"] + require.True(t, cfgExists) require.Equal(t, simpleConfigTwo, currentConfig.RawConfig) // Test Delete User, ensure config is removed and the resources are freed. 
require.NoError(t, store.DeleteAlertConfig(ctx, "user3")) err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) - currentConfig, exists = am.cfgs["user3"] - require.False(t, exists) + currentConfig, cfgExists = am.cfgs["user3"] + require.False(t, cfgExists) require.Equal(t, "", currentConfig.RawConfig) - _, exists = am.alertmanagers["user3"] - require.False(t, exists) + _, cfgExists = am.alertmanagers["user3"] + require.False(t, cfgExists) + dirs = am.getPerUserDirectories() + require.NotZero(t, dirs["user1"]) + require.NotZero(t, dirs["user2"]) + require.Zero(t, dirs["user3"]) // User3 is deleted, so we should have no more files for it. + require.False(t, fileExists(t, user3Dir)) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -169,21 +197,27 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { `), "cortex_alertmanager_config_last_reload_successful")) // Ensure when a 3rd config is re-added, it is synced correctly - require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ - User: "user3", - RawConfig: simpleConfigOne, - Templates: []*alertspb.TemplateDesc{}, - })) + require.NoError(t, store.SetAlertConfig(ctx, user3Cfg)) err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) require.NoError(t, err) - currentConfig, exists = am.cfgs["user3"] - require.True(t, exists) - require.Equal(t, simpleConfigOne, currentConfig.RawConfig) + currentConfig, cfgExists = am.cfgs["user3"] + require.True(t, cfgExists) + require.Equal(t, user3Cfg.RawConfig, currentConfig.RawConfig) - _, exists = am.alertmanagers["user3"] - require.True(t, exists) + _, cfgExists = am.alertmanagers["user3"] + require.True(t, cfgExists) + dirs = am.getPerUserDirectories() + require.NotZero(t, dirs["user1"]) + require.NotZero(t, dirs["user2"]) + require.Equal(t, 
user3Dir, dirs["user3"]) // Dir should exist, even though state files are not generated yet. + + // Hierarchy that existed before should exist again. + require.True(t, dirExists(t, user3Dir)) + require.True(t, dirExists(t, filepath.Join(user3Dir, templatesDir))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "first.tpl"))) + require.True(t, fileExists(t, filepath.Join(user3Dir, templatesDir, "second.tpl"))) assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` # HELP cortex_alertmanager_config_last_reload_successful Boolean set to 1 whenever the last configuration reload attempt was successful. @@ -194,6 +228,110 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs(t *testing.T) { `), "cortex_alertmanager_config_last_reload_successful")) } +func TestMultitenantAlertmanager_migrateStateFilesToPerTenantDirectories(t *testing.T) { + ctx := context.Background() + + const ( + user1 = "user1" + user2 = "user2" + ) + + store := prepareInMemoryAlertStore() + require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ + User: user2, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + })) + + reg := prometheus.NewPedanticRegistry() + cfg := mockAlertmanagerConfig(t) + am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, log.NewNopLogger(), reg) + require.NoError(t, err) + + createFile(t, filepath.Join(cfg.DataDir, "nflog:"+user1)) + createFile(t, filepath.Join(cfg.DataDir, "silences:"+user1)) + createFile(t, filepath.Join(cfg.DataDir, "nflog:"+user2)) + createFile(t, filepath.Join(cfg.DataDir, "templates", user2, "template.tpl")) + + require.NoError(t, am.migrateStateFilesToPerTenantDirectories()) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot))) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot))) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot))) + require.True(t, 
dirExists(t, filepath.Join(cfg.DataDir, user2, templatesDir))) + require.True(t, fileExists(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl"))) +} + +func fileExists(t *testing.T, path string) bool { + return checkExists(t, path, false) +} + +func dirExists(t *testing.T, path string) bool { + return checkExists(t, path, true) +} + +func checkExists(t *testing.T, path string, dir bool) bool { + fi, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + return false + } + require.NoError(t, err) + } + + require.Equal(t, dir, fi.IsDir()) + return true +} + +func TestMultitenantAlertmanager_deleteUnusedLocalUserState(t *testing.T) { + ctx := context.Background() + + const ( + user1 = "user1" + user2 = "user2" + ) + + store := prepareInMemoryAlertStore() + require.NoError(t, store.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ + User: user2, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + })) + + reg := prometheus.NewPedanticRegistry() + cfg := mockAlertmanagerConfig(t) + am, err := createMultitenantAlertmanager(cfg, nil, nil, store, nil, log.NewNopLogger(), reg) + require.NoError(t, err) + + createFile(t, filepath.Join(cfg.DataDir, user1, notificationLogSnapshot)) + createFile(t, filepath.Join(cfg.DataDir, user1, silencesSnapshot)) + createFile(t, filepath.Join(cfg.DataDir, user2, notificationLogSnapshot)) + createFile(t, filepath.Join(cfg.DataDir, user2, templatesDir, "template.tpl")) + + dirs := am.getPerUserDirectories() + require.Equal(t, 2, len(dirs)) + require.NotZero(t, dirs[user1]) + require.NotZero(t, dirs[user2]) + + // Ensure the configs are synced correctly + err = am.loadAndSyncConfigs(context.Background(), reasonPeriodic) + require.NoError(t, err) + + // loadAndSyncConfigs also cleans up obsolete files. Let's verify that. 
+ dirs = am.getPerUserDirectories() + + require.Zero(t, dirs[user1]) // has no configuration, files were deleted + require.NotZero(t, dirs[user2]) // has config, files survived +} + +func createFile(t *testing.T, path string) string { + dir := filepath.Dir(path) + require.NoError(t, os.MkdirAll(dir, 0777)) + f, err := os.Create(path) + require.NoError(t, err) + require.NoError(t, f.Close()) + return path +} + func TestMultitenantAlertmanager_NoExternalURL(t *testing.T) { amConfig := mockAlertmanagerConfig(t) amConfig.ExternalURL = flagext.URLValue{} // no external URL @@ -1055,3 +1193,36 @@ func TestAlertmanager_StateReplicationWithSharding(t *testing.T) { func prepareInMemoryAlertStore() alertstore.AlertStore { return bucketclient.NewBucketAlertStore(objstore.NewInMemBucket(), nil, log.NewNopLogger()) } + +func TestStoreTemplateFile(t *testing.T) { + tempDir, err := ioutil.TempDir(os.TempDir(), "alertmanager") + require.NoError(t, err) + + t.Cleanup(func() { + require.NoError(t, os.RemoveAll(tempDir)) + }) + + changed, err := storeTemplateFile(tempDir, "some-template", "content") + require.NoError(t, err) + require.True(t, changed) + + changed, err = storeTemplateFile(tempDir, "some-template", "new content") + require.NoError(t, err) + require.True(t, changed) + + changed, err = storeTemplateFile(tempDir, "some-template", "new content") // reusing previous content + require.NoError(t, err) + require.False(t, changed) + + _, err = storeTemplateFile(tempDir, ".", "content") + require.Error(t, err) + + _, err = storeTemplateFile(tempDir, "..", "content") + require.Error(t, err) + + _, err = storeTemplateFile(tempDir, "./test", "content") + require.Error(t, err) + + _, err = storeTemplateFile(tempDir, "../test", "content") + require.Error(t, err) +}