
Commit

Reduce likelihood of data loss when remote endpoint has an outage (#401)
* reduce likelihood of data loss when remote_write has an outage

* take out old config block

* s/some/some new
rfratto authored Feb 17, 2021
1 parent cca763b commit 23f037c
Showing 3 changed files with 26 additions and 8 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -13,6 +13,13 @@ can be found at [#317](https://github.com/grafana/agent/issues/317).
- [BUGFIX] Fixed a bug from v0.12.0 where the Loki installation script failed
because positions_directory was not set. (@rfratto)

+ - [BUGFIX] (#400) Reduce the likelihood of dataloss during a remote_write-side
+   outage by increasing the default wal_truncation_frequency to 60m and preventing
+   the WAL from being truncated if the last truncation timestamp hasn't changed.
+   This change increases the size of the WAL on average, and users may configure
+   a lower wal_truncation_frequency to deliberately choose a smaller WAL over
+   write guarantees. (@rfratto)
+
# v0.12.0 (2021-02-05)

BREAKING CHANGES: This release has two breaking changes in the configuration
15 changes: 8 additions & 7 deletions docs/configuration-reference.md
@@ -372,13 +372,14 @@ host_filter_relabel_configs:
[ - <relabel_config> ... ]
# How frequently the WAL truncation process should run. Every iteration of
- # truncation will checkpoint old series, create a new segment for new samples,
- # and remove old samples that have been succesfully sent via remote_write.
- # If there are are multiple remote_write endpoints, the endpoint with the
- # earliest timestamp is used for the cutoff period, ensuring that no data
- # gets truncated until all remote_write configurations have been able to
- # send the data.
- [wal_truncate_frequency: <duration> | default = "1m"]
+ # the truncation will checkpoint old series and remove old samples. If data
+ # has not been sent within this window, some of it may be lost.
+ #
+ # The size of the WAL will increase with less frequent truncations. Making
+ # truncations more frequent reduces the size of the WAL but increases the
+ # chances of data loss when remote_write is failing for longer than the
+ # specified frequency.
+ [wal_truncate_frequency: <duration> | default = "60m"]
# The minimum amount of time that series and samples should exist in the WAL
# before being considered for deletion. The consumed disk space of the WAL will
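For reference, a minimal sketch of how an operator might opt back into a smaller WAL after this change. Only `wal_truncate_frequency` is the field documented above; the surrounding `prometheus`/`configs` layout and the placeholder endpoint are assumptions based on the v0.13-era configuration reference, not part of this commit. Lowering the value trades away the write guarantees described in the changelog entry above in exchange for less disk usage.

```yaml
prometheus:
  wal_directory: /tmp/agent/wal
  configs:
    - name: default
      # Truncate more often than the new 60m default, deliberately choosing a
      # smaller WAL over delivery guarantees during a remote_write outage.
      wal_truncate_frequency: 15m
      remote_write:
        - url: http://localhost:9090/api/v1/write  # placeholder endpoint
```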
12 changes: 11 additions & 1 deletion pkg/prom/instance/instance.go
@@ -44,7 +44,7 @@ var (
var (
DefaultConfig = Config{
HostFilter: false,
- WALTruncateFrequency: 1 * time.Minute,
+ WALTruncateFrequency: 60 * time.Minute,
MinWALTime: 5 * time.Minute,
MaxWALTime: 4 * time.Hour,
RemoteFlushDeadline: 1 * time.Minute,
@@ -627,6 +627,10 @@ func (i *Instance) newDiscoveryManager(ctx context.Context, cfg *Config) (*disco
}

func (i *Instance) truncateLoop(ctx context.Context, wal walStorage, cfg *Config) {
+ // Track the last timestamp we truncated for to prevent segments from getting
+ // deleted until at least some new data has been sent.
+ var lastTs int64 = math.MinInt64
+
for {
select {
case <-ctx.Done():
@@ -654,6 +658,12 @@ func (i *Instance) truncateLoop(ctx context.Context, wal walStorage, cfg *Config
ts = maxTS
}

+ if ts == lastTs {
+ level.Debug(i.logger).Log("msg", "not truncating the WAL, remote_write timestamp is unchanged", "ts", ts)
+ continue
+ }
+ lastTs = ts
+
level.Debug(i.logger).Log("msg", "truncating the WAL", "ts", ts)
err := wal.Truncate(ts)
if err != nil {
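To make the new guard easier to read outside the diff, here is a self-contained sketch of a truncation loop with the same skip-if-unchanged behavior. It is not the agent's actual implementation: `walTruncator` and `lastRemoteWriteTs` are hypothetical stand-ins for the internal `walStorage` interface and `getRemoteWriteTimestamp`, and the `MinWALTime` adjustment from the real loop is omitted for brevity.

```go
package waltruncate

import (
	"context"
	"log"
	"math"
	"time"
)

// walTruncator is a stand-in for the agent's internal walStorage interface;
// only the Truncate call matters for this sketch.
type walTruncator interface {
	Truncate(mint int64) error
}

// truncateLoop periodically truncates the WAL, but only when remote_write has
// made progress since the previous truncation.
func truncateLoop(ctx context.Context, wal walTruncator, freq, maxWALTime time.Duration, lastRemoteWriteTs func() int64) {
	// Remember the timestamp used for the previous truncation so segments are
	// never deleted again until at least some new data has been sent.
	var lastTs int64 = math.MinInt64

	for {
		select {
		case <-ctx.Done():
			return
		case <-time.After(freq):
			// Newest timestamp (ms) successfully delivered by every remote_write queue.
			ts := lastRemoteWriteTs()

			// Bound disk usage: if remote_write has been failing for longer than
			// maxWALTime, truncate anyway using a capped cutoff.
			if maxTS := time.Now().Add(-maxWALTime).UnixMilli(); ts < maxTS {
				ts = maxTS
			}

			// The guard this commit adds: an unchanged timestamp means nothing new
			// was delivered since the last truncation, so keep the data around.
			if ts == lastTs {
				log.Printf("not truncating the WAL, remote_write timestamp is unchanged: ts=%d", ts)
				continue
			}
			lastTs = ts

			log.Printf("truncating the WAL: ts=%d", ts)
			if err := wal.Truncate(ts); err != nil {
				// A failed truncation only costs disk space and replay time.
				log.Printf("could not truncate WAL: %v", err)
			}
		}
	}
}
```

With the 60m default, a loop like this only reclaims WAL segments once remote_write confirms progress, which is why disk usage grows during an outage instead of unsent samples being dropped.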
