Skip to content

Commit

Permalink
Cleanup obsolete local files for alertmanager. (cortexproject#3910)
Browse files Browse the repository at this point in the history
* Cleanup obsolete local files for alertmanager.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* CHANGELOG.md

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Comment.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Don't ignore directories. Log error when deletion fails instead.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Address review feedback.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Move per-tenant state into tenant directory to simplify cleanup.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Move migration to separate function.
Add test for migration.
Fix test for deletion of unused dirs.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Store templates to correct place.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* CHANGELOG.md

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Verify that templates are stored properly into correct location.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Comments.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Comments.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Apply suggestions from code review

Co-authored-by: Marco Pracucci <marco@pracucci.com>
Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

* Review feedback.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>

Co-authored-by: Marco Pracucci <marco@pracucci.com>
  • Loading branch information
2 people authored and harry671003 committed Mar 11, 2021
1 parent e841b03 commit 32f1847
Show file tree
Hide file tree
Showing 6 changed files with 436 additions and 49 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## master / unreleased

* [CHANGE] Alertmanager now removes local files once the Alertmanager for a removed or resharded user is no longer running. #3910
* [CHANGE] Alertmanager now stores local files in per-tenant folders. Files previously stored by Alertmanager are migrated to the new hierarchy. Support for this migration will be removed in Cortex 1.10. #3910
* [ENHANCEMENT] Ruler: optimized `<prefix>/api/v1/rules` and `<prefix>/api/v1/alerts` when ruler sharding is enabled. #3916
* [ENHANCEMENT] Ruler: added the following metrics when ruler sharding is enabled: #3916
* `cortex_ruler_clients`
Expand Down
10 changes: 10 additions & 0 deletions development/tsdb-blocks-storage-s3/config/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,13 @@ groups:
rules:
- record: up:count
expr: count(up)

- name: example2
rules:
- alert: TooManyServices
expr: count(up) > 1
for: 1m
labels:
severity: page
annotations:
summary: Too many services
34 changes: 23 additions & 11 deletions pkg/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,28 @@ import (
"github.com/cortexproject/cortex/pkg/util/services"
)

const notificationLogMaintenancePeriod = 15 * time.Minute
const (
// maintenancePeriod is the interval at which silences and the notification
// log are periodically stored to their local snapshot files.
maintenancePeriod = 15 * time.Minute

// File and directory names used within the tenant-specific data directory.
// Each is joined onto the tenant data directory to build the full path.
notificationLogSnapshot = "notifications"
silencesSnapshot = "silences"
templatesDir = "templates"
)

// Config configures an Alertmanager.
type Config struct {
UserID string
// Used to persist notification logs and silences on disk.
DataDir string
UserID string
Logger log.Logger
Peer *cluster.Peer
PeerTimeout time.Duration
Retention time.Duration
ExternalURL *url.URL

// Tenant-specific local directory where AM can store its state (notifications, silences, templates). When AM is stopped, entire dir is removed.
TenantDataDir string

ShardingEnabled bool
ReplicationFactor int
ReplicateStateFunc func(context.Context, string, *clusterpb.Part) error
Expand Down Expand Up @@ -118,6 +127,10 @@ type State interface {

// New creates a new Alertmanager.
func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
if cfg.TenantDataDir == "" {
return nil, fmt.Errorf("directory for tenant-specific AlertManager is not configured")
}

am := &Alertmanager{
cfg: cfg,
logger: log.With(cfg.Logger, "user", cfg.UserID),
Expand Down Expand Up @@ -153,12 +166,11 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
}

am.wg.Add(1)
nflogID := fmt.Sprintf("nflog:%s", cfg.UserID)
var err error
am.nflog, err = nflog.New(
nflog.WithRetention(cfg.Retention),
nflog.WithSnapshot(filepath.Join(cfg.DataDir, nflogID)),
nflog.WithMaintenance(notificationLogMaintenancePeriod, am.stop, am.wg.Done),
nflog.WithSnapshot(filepath.Join(cfg.TenantDataDir, notificationLogSnapshot)),
nflog.WithMaintenance(maintenancePeriod, am.stop, am.wg.Done),
nflog.WithMetrics(am.registry),
nflog.WithLogger(log.With(am.logger, "component", "nflog")),
)
Expand All @@ -171,9 +183,9 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {

am.marker = types.NewMarker(am.registry)

silencesID := fmt.Sprintf("silences:%s", cfg.UserID)
silencesFile := filepath.Join(cfg.TenantDataDir, silencesSnapshot)
am.silences, err = silence.New(silence.Options{
SnapshotFile: filepath.Join(cfg.DataDir, silencesID),
SnapshotFile: silencesFile,
Retention: cfg.Retention,
Logger: log.With(am.logger, "component", "silences"),
Metrics: am.registry,
Expand All @@ -189,7 +201,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {

am.wg.Add(1)
go func() {
am.silences.Maintenance(15*time.Minute, filepath.Join(cfg.DataDir, silencesID), am.stop)
am.silences.Maintenance(maintenancePeriod, silencesFile, am.stop)
am.wg.Done()
}()

Expand Down Expand Up @@ -249,7 +261,7 @@ func (am *Alertmanager) ApplyConfig(userID string, conf *config.Config, rawCfg s
templateFiles := make([]string, len(conf.Templates))
if len(conf.Templates) > 0 {
for i, t := range conf.Templates {
templateFiles[i] = filepath.Join(am.cfg.DataDir, "templates", userID, t)
templateFiles[i] = filepath.Join(am.cfg.TenantDataDir, templatesDir, t)
}
}

Expand Down
8 changes: 4 additions & 4 deletions pkg/alertmanager/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,14 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {
// not to configured data dir, and on the flipside, it'll fail if we can't write
// to tmpDir. Ignoring both cases for now as they're ultra rare but will revisit if
// we see this in the wild.
tmpDir, err := ioutil.TempDir("", "validate-config")
userTempDir, err := ioutil.TempDir("", "validate-config-"+cfg.User)
if err != nil {
return err
}
defer os.RemoveAll(tmpDir)
defer os.RemoveAll(userTempDir)

for _, tmpl := range cfg.Templates {
_, err := createTemplateFile(tmpDir, cfg.User, tmpl.Filename, tmpl.Body)
_, err := storeTemplateFile(userTempDir, tmpl.Filename, tmpl.Body)
if err != nil {
level.Error(logger).Log("msg", "unable to create template file", "err", err, "user", cfg.User)
return fmt.Errorf("unable to create template file '%s'", tmpl.Filename)
Expand All @@ -169,7 +169,7 @@ func validateUserConfig(logger log.Logger, cfg alertspb.AlertConfigDesc) error {

templateFiles := make([]string, len(amCfg.Templates))
for i, t := range amCfg.Templates {
templateFiles[i] = filepath.Join(tmpDir, "templates", cfg.User, t)
templateFiles[i] = filepath.Join(userTempDir, t)
}

_, err = template.FromGlobs(templateFiles...)
Expand Down
Loading

0 comments on commit 32f1847

Please sign in to comment.