From 769bbb78ad5e845de5847dd7b4f392c953505a4f Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Wed, 15 Feb 2023 13:44:08 +0000 Subject: [PATCH 01/14] Add WAL documentation. Also fix some minor metrics registration details --- agent/consul/server_log_verification.go | 4 +- agent/setup.go | 41 ++ .../docs/agent/config/config-files.mdx | 101 ++++- website/content/docs/agent/telemetry.mdx | 28 +- .../testing-experimental-raft-backend.mdx | 420 ++++++++++++++++++ website/data/docs-nav-data.json | 4 + 6 files changed, 583 insertions(+), 15 deletions(-) create mode 100644 website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx diff --git a/agent/consul/server_log_verification.go b/agent/consul/server_log_verification.go index 0c7e63e3a12c..cb95b9aeeee8 100644 --- a/agent/consul/server_log_verification.go +++ b/agent/consul/server_log_verification.go @@ -62,12 +62,12 @@ func makeLogVerifyReportFn(logger hclog.Logger) verifier.ReportFn { if r.WrittenSum > 0 && r.WrittenSum != r.ExpectedSum { // The failure occurred before the follower wrote to the log so it // must be corrupted in flight from the leader! - l2.Info("verification checksum FAILED: in-flight corruption", + l2.Error("verification checksum FAILED: in-flight corruption", "followerWriteChecksum", fmt.Sprintf("%08x", r.WrittenSum), "readChecksum", fmt.Sprintf("%08x", r.ReadSum), ) } else { - l2.Info("verification checksum FAILED: storage corruption", + l2.Error("verification checksum FAILED: storage corruption", "followerWriteChecksum", fmt.Sprintf("%08x", r.WrittenSum), "readChecksum", fmt.Sprintf("%08x", r.ReadSum), ) diff --git a/agent/setup.go b/agent/setup.go index 01d7b7593f62..fd4be3af29a1 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -9,6 +9,8 @@ import ( "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/go-hclog" + wal "github.com/hashicorp/raft-wal" + "github.com/hashicorp/raft-wal/verifier" "google.golang.org/grpc/grpclog" autoconf "github.com/hashicorp/consul/agent/auto-config" @@ -270,6 +272,22 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau consul.LeaderPeeringMetrics, xdscapacity.StatsGauges, ) + + verifierGauges := make([]prometheus.GaugeDefinition, 0) + for _, d := range verifier.MetricDefinitions.Gauges { + verifierGauges = append(verifierGauges, prometheus.GaugeDefinition{ + Name: []string{"raft", "logstore", "verifier", d.Name}, + Help: d.Desc, + }) + } + walGauges := make([]prometheus.GaugeDefinition, 0) + for _, d := range wal.MetricDefinitions.Gauges { + walGauges = append(walGauges, prometheus.GaugeDefinition{ + Name: []string{"raft", "wal", d.Name}, + Help: d.Desc, + }) + } + gauges = append(gauges, verifierGauges, walGauges) } // Flatten definitions @@ -316,6 +334,29 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau raftCounters, rate.Counters, } + + // For some unknown reason, we seem to add the raft counters above without + // checking if this is a server like we do above for some of the summaries + // above. We should probably fix that but I want to not change behavior right + // now. If we are a server, add summaries for WAL and verifier metrics. 
+ if isServer { + verifierCounters := make([]prometheus.CounterDefinition, 0) + for _, d := range verifier.MetricDefinitions.Counters { + verifierCounters = append(verifierCounters, prometheus.CounterDefinition{ + Name: []string{"raft", "logstore", "verifier", d.Name}, + Help: d.Desc, + }) + } + walCounters := make([]prometheus.CounterDefinition, 0) + for _, d := range wal.MetricDefinitions.Counters { + walCounters = append(walCounters, prometheus.CounterDefinition{ + Name: []string{"raft", "wal", d.Name}, + Help: d.Desc, + }) + } + counters = append(counters, verifierCounters, walCounters) + } + // Flatten definitions // NOTE(kit): Do we actually want to create a set here so we can ensure definition names are unique? var counterDefs []prometheus.CounterDefinition diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index 82053a935674..84cc31fdaf24 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -1586,15 +1586,98 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." ## Raft Parameters -- `raft_boltdb` ((#raft_boltdb)) This is a nested object that allows configuring - options for Raft's BoltDB based log store. - - - `NoFreelistSync` ((#NoFreelistSync)) Setting this to `true` will disable - syncing the BoltDB freelist to disk within the raft.db file. Not syncing - the freelist to disk will reduce disk IO required for write operations - at the expense of potentially increasing start up time due to needing - to scan the db to discover where the free space resides within the file. - +- `raft_boltdb` ((#raft_boltdb)) **These fields are deprecated in Consul 1.15.0. + See the [`raft_logstore`](#raft_logstore) instead.** This is a nested + object that allows configuring options for Raft's BoltDB based log store. + + - `NoFreelistSync` **This field is deprecated in Consul 1.15.0. See the + [`raft_logstore.boltdb.no_freelist_sync`](#raft_logstore_boltdb_no_freelist_sync) field + instead.** Setting this to `true` will disable syncing the BoltDB freelist + to disk within the raft.db file. Not syncing the freelist to disk will + reduce disk IO required for write operations at the expense of potentially + increasing start up time due to needing to scan the db to discover where the + free space resides within the file. + +- `raft_logstore` ((#raft_logstore)) This is a nested object that allows + configuring options for Raft's LogStore component which is used to persist + logs and crucial Raft state on disk during writes. This was added in Consul + 1.15. + + - `backend` ((#raft_logstore_backend)) This allows selection of which storage + engine to use to persist logs. Valid options are `boltdb` or `wal`. Default + is `boltdb`. As of Consul 1.15, `wal` is a new and experimental backend that + should be used with caution. See [our experimental WAL backend testing + guide](/consul/docs/upgrading/instructions/testing-experimental-raft-backend) + to learn how to safely evaluate it for your workload. + + - `disable_log_cache` ((#raft_logstore_disable_log_cache)) This allows + disabling of the in-memory cache of recent logs. This exists mostly for + performance testing purposes. In theory the log cache prevents disk reads + for recent logs. In practice recent logs are still in OS page cache so tend + not to be slow to read using either backend. We recommend leaving it enabled + for now as we've not measured a significant improvement in any metric by + disabling. 
+ + - `verification` ((#raft_logstore_verification)) This is a nested object that + allows configuring online verification of the LogStore. Verification + provides additional assurances that LogStore backends are correctly storing + data. It imposes very low overhead on servers and is safe to run in + production, however it's mostly useful when evaluating a new backend + implementation. + + Verification must be enabled on the leader to have any effect and can be + used with any backend. When enabled, the leader will periodically write a + special "checkpoint" log message including checksums of all log entries + written to Raft since the last checkpoint. Followers that have verification + enabled will run a background task for each checkpoint that reads all logs + directly from the LogStore and recomputes the checksum. A report is output + as an INFO level log for each checkpoint. + + Checksum failure should never happen and indicate unrecoverable corruption + on that server. The only correct response is to stop the server, remove its + data directory, and restart so it can be caught back up with a correct + server again. Please report verification failures including details about + your hardware and workload via GitHub issues. See [our experimental WAL + backend testing + guide](/consul/docs/upgrading/instructions/testing-experimental-raft-backend) + for more details on using these to evaluate a new backend. + + - `enabled` ((#raft_logstore_verification_enabled)) - Setting this to `true` + will enable log verification checkpoints to be written (if leader) and + verified on this server. + + - `interval` ((#raft_logstore_verification_interval)) - The time interval + between checkpoints. There is no default so both `enabled` and `interval` + must be set explicitly to correctly enable. An interval of `30s` to `5m` + is most likely to be useful. The performance overhead should not be + significant at any level above a few seconds, it's mostly useful to + control how frequently the report logs appear for human observation. + + - `boltdb` ((#raft_logstore_boltdb)) - This is a nested object that allows + configuring options for Raft's `boltdb` backend. It has no effect if the + `backend` is not `boltdb`. + + - `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Setting this + to `true` will disable syncing the BoltDB freelist to disk within the + raft.db file. Not syncing the freelist to disk will reduce disk IO required + for write operations at the expense of potentially increasing start up time + due to needing to scan the db to discover where the free space resides + within the file. + + - `wal` ((#raft_logstore_wal)) - This is a nested object that allows + configuring the `wal` backend. See [our experimental WAL backend testing + guide](/consul/docs/upgrading/instructions/testing-experimental-raft-backend) + for more details on safely evaluating this backend. + + - `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - This is an + integer representing the target size (in MB) for each segment file before + rolling to a new segment. The default is 64 and should be suitable for + almost all deployments. A smaller value may use less disk space since + it can be reclaimed by deleting old segments sooner at the expense of + performing a more costly operation to safely rotate to a new file more + frequently which could impact tail latencies. Larger values are unlikely + to improve performance significantly. This is mostly exposed for + performance testing purposes. 
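+
+  The following combined example is a sketch for illustration only; it shows
+  how these nested options fit together when evaluating the `wal` backend with
+  verification enabled. The values are illustrative (for example, `64` is the
+  documented default segment size), not recommendations:
+
+  ```hcl
+  raft_logstore {
+    backend = "wal"
+
+    verification {
+      enabled  = true
+      interval = "60s"
+    }
+
+    wal {
+      segment_size_mb = 64
+    }
+  }
+  ```
+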
- `raft_protocol` ((#raft_protocol)) Equivalent to the [`-raft-protocol` command-line flag](/consul/docs/agent/config/cli-flags#_raft_protocol). diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index 53d7ed917663..fb79f71d9dcb 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -294,7 +294,7 @@ This metric should be monitored to ensure that the license doesn't expire to pre | Metric Name | Description | Unit | Type | | :-------------------------------- | :--------------------------------------------------------------- | :---- | :---- | -| `consul.raft.boltdb.freelistBytes` | Represents the number of bytes necessary to encode the freelist metadata. When [`raft_boltdb.NoFreelistSync`](/consul/docs/agent/config/config-files#NoFreelistSync) is set to `false` these metadata bytes must also be written to disk for each committed log. | bytes | gauge | +| `consul.raft.boltdb.freelistBytes` | Represents the number of bytes necessary to encode the freelist metadata. When [`raft_logstore.boltdb.no_freelist_sync`](/consul/docs/agent/config/config-files#raft_logstore_boltdb_no_freelist_sync) is set to `false` these metadata bytes must also be written to disk for each committed log. | bytes | gauge | | `consul.raft.boltdb.logsPerBatch` | Measures the number of logs being written per batch to the db. | logs | sample | | `consul.raft.boltdb.storeLogs` | Measures the amount of time spent writing logs to the db. | ms | timer | | `consul.raft.boltdb.writeCapacity` | Theoretical write capacity in terms of the number of logs that can be written per second. Each sample outputs what the capacity would be if future batched log write operations were similar to this one. This similarity encompasses 4 things: batch size, byte size, disk performance and boltdb performance. While none of these will be static and its highly likely individual samples of this metric will vary, aggregating this metric over a larger time window should provide a decent picture into how this BoltDB store can perform | logs/second | sample | @@ -337,11 +337,15 @@ indicator of an actual issue, this metric can be used to diagnose why the `consu is high. If Bolt DB log storage performance becomes an issue and is caused by free list management then setting -[`raft_boltdb.NoFreelistSync`](/consul/docs/agent/config/config-files#NoFreelistSync) to `true` in the server's configuration +[`raft_logstore.boltdb.no_freelist_sync`](/consul/docs/agent/config/config-files#raft_logstore_boltdb_no_freelist_sync) to `true` in the server's configuration may help to reduce disk IO and log storage operation times. Disabling free list syncing will however increase the startup time for a server as it must scan the raft.db file for free space instead of loading the already populated free list structure. +~> Note that as of Consul 1.15 there is a [new experimental storage +backend](http://localhost:3000/consul/docs/upgrading/instructions/testing-experimental-raft-backend) +that can be trialed instead of BoltDB. + ## Metrics Reference @@ -418,7 +422,7 @@ These metrics are used to monitor the health of the Consul servers. | `consul.raft.applied_index` | Represents the raft applied index. | index | gauge | | `consul.raft.apply` | Counts the number of Raft transactions occurring over the interval, which is a general indicator of the write load on the Consul servers. 
| raft transactions / interval | counter | | `consul.raft.barrier` | Counts the number of times the agent has started the barrier i.e the number of times it has issued a blocking call, to ensure that the agent has all the pending operations that were queued, to be applied to the agent's FSM. | blocks / interval | counter | -| `consul.raft.boltdb.freelistBytes` | Represents the number of bytes necessary to encode the freelist metadata. When [`raft_boltdb.NoFreelistSync`](/consul/docs/agent/config/config-files#NoFreelistSync) is set to `false` these metadata bytes must also be written to disk for each committed log. | bytes | gauge | +| `consul.raft.boltdb.freelistBytes` | Represents the number of bytes necessary to encode the freelist metadata. When [`raft_logstore.boltdb.no_freelist_sync`](/consul/docs/agent/config/config-files#raft_logstore_boltdb_no_freelist_sync) is set to `false` these metadata bytes must also be written to disk for each committed log. | bytes | gauge | | `consul.raft.boltdb.freePageBytes` | Represents the number of bytes of free space within the raft.db file. | bytes | gauge | | `consul.raft.boltdb.getLog` | Measures the amount of time spent reading logs from the db. | ms | timer | | `consul.raft.boltdb.logBatchSize` | Measures the total size in bytes of logs being written to the db in a single batch. | bytes | sample | @@ -452,6 +456,11 @@ These metrics are used to monitor the health of the Consul servers. | `consul.raft.last_index` | Represents the raft applied index. | index | gauge | | `consul.raft.leader.dispatchLog` | Measures the time it takes for the leader to write log entries to disk. | ms | timer | | `consul.raft.leader.dispatchNumLogs` | Measures the number of logs committed to disk in a batch. | logs | gauge | +| `consul.raft.logstore.verifier.checkpoints_written` | Counts the number of checkpoint entries written to the LogStore. | checkpoints | counter | +| `consul.raft.logstore.verifier.dropped_reports` | Counts how many times the verifier routine was still busy when the next checksum came in and so verification for a range was skipped. If you see this happen, consider increasing the interval between checkpoints with [`raft_logstore.verification.interval`](http://localhost:3000/consul/docs/agent/config/config-files#raft_logstore_verification) | reports dropped | counter | +| `consul.raft.logstore.verifier.ranges_verified` | Counts the number of log ranges for which a verification report has been completed. See [monitoring experimental backends](http://localhost:3000/consul/docs/upgrading/instructions/testing-experimental-raft-backend#monitoring) for more information. | log ranges verifications | counter | +| `consul.raft.logstore.verifier.read_checksum_failures` | Counts the number of times a range of logs between two check points contained at least one disk corruption. See [monitoring experimental backends](http://localhost:3000/consul/docs/upgrading/instructions/testing-experimental-raft-backend#monitoring) for more information. | disk corruptions | counter | +| `consul.raft.logstore.verifier.write_checksum_failures` | Counts the number of times a follower has a different checksum to the leader at the point where it writes to the log. This could be caused by either a disk-corruption on the leader (unlikely) or some other corruption of the log entries in-flight. | in-flight corruptions | counter | | `consul.raft.leader.lastContact` | Measures the time since the leader was last able to contact the follower nodes when checking its leader lease. 
It can be used as a measure for how stable the Raft timing is and how close the leader is to timing out its lease.The lease timeout is 500 ms times the [`raft_multiplier` configuration](/consul/docs/agent/config/config-files#raft_multiplier), so this telemetry value should not be getting close to that configured value, otherwise the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the [Server Performance](/consul/docs/install/performance) guide for more details. | ms | timer | | `consul.raft.leader.oldestLogAge` | The number of milliseconds since the _oldest_ log in the leader's log store was written. This can be important for replication health where write rate is high and the snapshot is large as followers may be unable to recover from a restart if restoring takes longer than the minimum value for the current leader. Compare this with `consul.raft.fsm.lastRestoreDuration` and `consul.raft.rpc.installSnapshot` to monitor. In normal usage this gauge value will grow linearly over time until a snapshot completes on the leader and the log is truncated. Note: this metric won't be emitted until the leader writes a snapshot. After an upgrade to Consul 1.10.0 it won't be emitted until the oldest log was written after the upgrade. | ms | gauge | | `consul.raft.replication.heartbeat` | Measures the time taken to invoke appendEntries on a peer, so that it doesn't timeout on a periodic basis. | ms | timer | @@ -476,7 +485,18 @@ These metrics are used to monitor the health of the Consul servers. | `consul.raft.state.follower` | Counts the number of times an agent has entered the follower mode. This happens when a new agent joins the cluster or after the end of a leader election. | follower state entered / interval | counter | | `consul.raft.transition.heartbeat_timeout` | The number of times an agent has transitioned to the Candidate state, after receive no heartbeat messages from the last known leader. | timeouts / interval | counter | | `consul.raft.verify_leader` | This metric doesn't have a direct correlation to the leader change. It just counts the number of times an agent checks if it is still the leader or not. For example, during every consistent read, the check is done. Depending on the load in the system, this metric count can be high as it is incremented each time a consistent read is completed. | checks / interval | Counter | -| `consul.rpc.accept_conn` | Increments when a server accepts an RPC connection. | connections | counter | +| `consul.raft.wal.head_truncations` | Counts how many log entries have been truncated from the head - i.e. the oldest entries. by graphing the rate of change over time you can see individual truncate calls as spikes. | logs entries truncated | counter | +| `consul.raft.wal.last_segment_age_seconds` | A gauge that is set each time we rotate a segment and describes the number of seconds between when that segment file was first created and when it was sealed. this gives a rough estimate how quickly writes are filling the disk. | seconds | gauge | +| `consul.raft.wal.log_appends` | Counts the number of calls to StoreLog(s) i.e. number of batches of entries appended. | calls | counter | +| `consul.raft.wal.log_entries_read` | Counts the number of log entries read. | log entries read | counter | +| `consul.raft.wal.log_entries_written` | Counts the number of log entries written. 
| log entries written | counter | +| `consul.raft.wal.log_entry_bytes_read` | Counts the bytes of log entry read from segments before decoding. actual bytes read from disk might be higher as it includes headers and index entries and possible secondary reads for large entries that don't fit in buffers. | bytes | counter | +| `consul.raft.wal.log_entry_bytes_written` | Counts the bytes of log entry after encoding with Codec. Actual bytes written to disk might be slightly higher as it includes headers and index entries. | bytes | counter | +| `consul.raft.wal.segment_rotations` | Counts how many times we move to a new segment file. | rotations | counter | +| `consul.raft.wal.stable_gets` | Counts how many calls to StableStore.Get or GetUint64. | calls | counter | +| `consul.raft.wal.stable_sets` | Counts how many calls to StableStore.Set or SetUint64. | calls | counter | +| `consul.raft.wal.tail_truncations` | Counts how many log entries have been truncated from the head - i.e. the newest entries. by graphing the rate of change over time you can see individual truncate calls as spikes. | logs entries truncated | counter | +| `consul.rpc.accept_conn` | Increments when a server accepts an RPC connection. | connections | counter | | `consul.rpc.rate_limit.exceeded` | Increments whenever an RPC is over a configured rate limit. In permissive mode, the RPC is still allowed to proceed. | RPCs | counter | | `consul.rpc.rate_limit.log_dropped` | Increments whenever a log that is emitted because an RPC exceeded a rate limit gets dropped because the output buffer is full. | log messages dropped | counter | | `consul.catalog.register` | Measures the time it takes to complete a catalog register operation. | ms | timer | diff --git a/website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx b/website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx new file mode 100644 index 000000000000..e5360eab0918 --- /dev/null +++ b/website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx @@ -0,0 +1,420 @@ +--- +layout: docs +page_title: Testing the Experimental WAL LogStore Backend +description: >- + Consul 1.15 introduced a new experimental storage backend option. Learn how to + configure and test it out in a safe way. +--- + +# Testing the Experimental WAL LogStore Backend + +## Introduction + +Consul 1.15 introduced a new experimental storage backend. This guide explains +how to configure it and test it out safely. The new backend is called `wal` in +configuration. + +WAL is a acronym for "Write-Ahead Log". We called it this because it implements +a traditional log with rotating, append-only log files. The current `LogStore` +uses BoltDB which is a copy-on-write BTree which is less optimized for +append-only workloads. + +~> The `wal` backend is considered **experimental** in Consul 1.15. Please test +it safely in pre-production first and verify on a subset of servers in +production using this guide before fully enabling it on all servers. + +## Why build a new backend? + +The WAL backend has been written to resolve some long-standing issues with the +current BoltDB backend. The existing BoltDB log store has worked reliably for +most users for years, however it is not the most efficient way to store +append-only logs to disk since it was designed as a full key-value database. It +was an expedient option when our raft library was first written and always +assumed we'd replace it with something more purpose-built. 
+ +Importantly, a BoltDB database is a single file that only ever grows. Deleting +the oldest logs which we do regularly when we've made a new snapshots of the +state, leaves free space in the file that needs to be tracked to be re-used on +future writes. By contrast a simple segmented log can just delete the oldest log +files from disk. With BoltDB uses as a log, sudden burst of writes say three +times larger than the normal volume can suddenly cause the file to grow to about +several times it's steady-state size. After the next snapshot is taken, and the +oldest logs truncated again, the file is left as mostly empty space. Tracking +this free space requires writing extra metadata proportional to the amount of +free pages to disk with every write and so after such a burst, write latencies +tend to increase - in some cases dramatically causing serious performance +degradation to the cluster. + +Even if this has never happened to a catastrophic degree in a cluster, the fact +that it's a risk has meant that Consul has erred on the side of never letting +too many logs accumulate in the LogStore. Significantly larger BoltDB files are +somewhat slower in general because it's a tree and so still has log(N) work to +do n every write. But out user's experience showed that the larger the file, the +more likely it is to have a large freelist or suddenly form one after a burst of +writes. For this reason, the default options for how frequently we make a full +snapshot and truncate the logs, and for how many logs we keep around have always +been aggressively set towards keeping BoltDB small rather than using disk IO the +most efficiently. + +Other reliability issues such as [followers being unable to catch +up](/consul/docs/agent/telemetry#raft-replication-capacity-issues) also stem +from this need to carefully balance the size of the BoltDB log store against how +long snapshots take to restore - there is a simple solution to that issue if +letting logs grow much larger to ensure recovery didn't have a potentially +catastrophic impact on the cluster's write performance. + +While not every user will experience a huge difference in performance, the WAL +backend avoids these performance concerns entirely. It is more performant when +directly measured due to solving a simpler storage problem than BoltDB was +designed for. For example it can commit a single log entry with on fsync instead +of two, and tends to write 2-3x fewer bytes to the disk to do it. The real +benefit though is that retaining more logs won't impact write performance at all +and so reducing disk IO with slower snapshots or keeping logs around to catch up +slower followers are all possible. + +## Why try it out? + +The new WAL backend has been tested thoroughly during development: + * Every component in the WAL like [metadata + management](https://github.com/hashicorp/raft-wal/blob/main/types/meta.go), + [log file + encoding](https://github.com/hashicorp/raft-wal/blob/main/types/segment.go) + to actual [file-system + interaction](https://github.com/hashicorp/raft-wal/blob/main/types/vfs.go) + was abstracted so unit tests can simulate all sorts of difficult-to-reproduce + disk failures. + * We [used + ALICE](https://github.com/hashicorp/raft-wal/blob/main/alice/README.md), to + exhaustively simulate thousands of possible crash failure scenarios and test + that WAL correctly recovered from each. 
+ * We ran hundreds of tests over a few weeks in a performance testing cluster + with checksum verification enabled and detected zero cases of data loss or + corruption. We plan to continue testing this continuously over the next few + months too before making it the default backend. + +However, we are well aware of both how complex and how critical disk-persistence +is for our user's data. + +Our hope is that we will have many users at all degrees of scale try WAL in +their environments after upgrading to 1.15 and report success or failure back so +we have increased confidence before we make it the default for new clusters. + +This guide describes how to safely try and verify it without risking the +availability of your cluster should there be a latent data-loss issue +discovered. + +## Requirements + +- All servers in the Datacenter should be upgraded to Consul 1.15 using the + [standard upgrade procedure](/consul/docs/upgrading/general-process) and + the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x). +- You need a Consul cluster with at least 3 nodes to safely test the new + backend without downtime. + +## Assumptions + +This guide makes the following assumptions: + +- You have taken a snapshot prior to testing in case things don't go to plan. +- You have the ability to monitor Consul server metrics and logs, and ideally + set an alert on specific log events occurring. +- We assume that you will try this in a pre-production environment first and + leave it running for a few days or weeks to gain confidence before attempting + it in production. + +## Risks + +Although we are describing a way to test this that limits risk as far as +possible, there still are potential risks it's important to know: + + - If WAL is enabled on a server and is found to corrupt data in some way. That + server's data can't be recovered. The server will need to be restarted with + an empty data directory and reload it's state from the leader. + - It's possible that WAL might corrupt data or otherwise have a bug that causes + the server to panic and crash. It may even not be able to restart if the same + bug occurs when it reads from the logs on startup. In this case as above the + server would need to be restarted with WAL disabled and an empty data + directory. + - It's _possible_ though unlikely that if WAL corrupted data, clients might + read corrupted data from that server. For example invalid IP addresses or + have tokens fail to match. This is unlikely even if there is a WAL corruption + bug hit because replication typically takes place using in-memory cached + objects rather than reads from disk. The issue would be fixed by restoring + the server. + - If you enable a server to use WAL using Consul OSS or on a voting server with + Consul Enterprise, it's _possible_ that the WAL could cause corruption of + that server's state (with the caveats above) _and then_ become the leader and + replicate that corruption to all other servers. In this scenario only a + restore from backup would recover a completely un-corrupt state. + - If you test on a non-voting server in Enterprise, this can't happen. + +## Procedure to enable WAL on one server + +**1. Enable log verification (new in 1.15).** + +This needs to be enabled on any voting server in Enterprise and all servers in +OSS since it is the leader that writes verification checkpoints. 
+ +On each voting server add the following to the server's configuration file: + +```hcl +raft_logstore { + verification { + enabled = true + interval = "60s" + } +} +``` + +You'll need to restart each server in turn for that to take effect (`consul +reload` is not sufficient). Wait for each one to become a healthy voter again +using `consul operator raft list-peers` before moving on to the next. This can +take a few minutes if the snapshot is large. + +You should now see log entries on the servers every minute that look like this: + +``` +2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c +``` + + +**2. Select a server to enable WAL on.** + +As noted in [Risks](#risks), Consul Enterprise users should select a non-voting +server at first. For Consul OSS users, or Enterprise users who don't have +non-voting servers, select one of the follower servers. + +You can find the current state of the servers by running. + +```shell-session +$ consul operator raft list-peers +``` + +We'll refer to this server as the "target" server from now on. + + +**3. Stop the target server gracefully** + +For example using `systemctl stop consul` if using systemd. + +~> If you have any configuration management automation like Chef or Puppet that +might interfere with this process, they must be disabled until the process is +complete. + + +**4. Remove the data directory from the target server.** + +Moving rather than deleting is less destructive in case of command errors. Once +the server has restarted though, the data in the old dir should not be used for +recovery and will eventually need to be removed entirely. + +Replace `/data-dir` with whatever is set in `data_dir` in your configuration +file. + +```shell-session +$ mv /data-dir/raft /data-dir/raft.bak +``` + + +**5. Update the target server's configuration.** + +Add the following to the target server's configuration file: + +```hcl +raft_logstore { + backend = "wal" + verification { + enabled = true + interval = "60s" + } +} +``` + +**6. Start the target server.** + +For example with `systemctl start consul` if using systemd. + +**7. Monitor target server raft metrics and logs.** + +See the section below on [monitoring WAL tests](#monitoring-wal-tests). + +We suggest you leave the cluster in this configuration for days or weeks, +assuming that you see no negative metrics or verification errors in logs to +increase confidence in the WAL under varying workloads and during routine server +restarts etc. + +~> If you disabled Chef, Puppet or similar earlier, you may want to consider +enabling it again while the test runs. Ensure that it will not "fix" the Consul +configuration file and remove the different backend though. + +**Next Steps.** + + * If you see any verification errors, performance anomalies or other suspicious +behavior from the target server during the test, you should follow [the +procedure to revert back to BoltDB](#procedure-to-revert-to-boltdb). + + * If you see no errors and would like to expand the test further, you can repeat +the above procedure on another target server. We suggest waiting a while after +each and slowly rolling out. Once the majority of your servers are using WAL any +bugs not yet found could result in cluster unavailability. + + * If you wish to permanently enable `wal` on all servers, you'll need to follow +the above steps on each one. 
Even if `backend = "wal"` is set in logs, servers +will continue to use BoltDB if they find an existing raft.db file in the data +dir. + +~> When switching backend, you must always remove the _whole raft directory_ +not just the `raft.db` file or `wal` directory since the log must always be +consistent with the snapshots to avoid undefined behavior or data loss. + +## Procedure to revert to BoltDB + +To revert a server that is using `wal` to using BoltDB, the steps are +essentially the same as the procedure above. + +**1. Stop the target server gracefully** + +For example using `systemctl stop consul` if using systemd. + +~> If you have any configuration management automation like Chef or Puppet that +might interfere with this process, they must be disabled until the process is +complete. + + +**2. Remove the data directory from the target server.** + +Moving rather than deleting is less destructive in case of command errors. Once +the server has restarted though, the data in the old dir should not be used for +recovery and will eventually need to be removed entirely. + +Replace `/data-dir` with whatever is set in `data_dir` in your configuration +file. + +```shell-session +$ mv /data-dir/raft /data-dir/raft.bak +``` + + +**3. Update the target server's configuration.** + +Modify the `backend` in the target server's configuration file: + +```hcl +raft_logstore { + backend = "boltdb" + verification { + enabled = true + interval = "60s" + } +} +``` + +**4. Start the target server.** + +For example with `systemctl start consul` if using systemd. + +**5. Watch for the server to become a healthy voter again.** + +```shell-session +$ consul operator raft list-peers +``` + +**6. If necessary, cleanup any `raft.bak` directories.** + +```shell-session +$ rm /data-dir/raft.bak +``` + +## Monitoring + +Throughout the testing period, it's important to monitor the cluster and +especially the target server for signals that the WAL is not performing properly +or is behaving incorrectly. + +### Monitor for checksum failures + +If the log store verification fails on any server (whether it's running BoltDB +or WAL backed), that is an **unrecoverable error**. It will look something like +this in the logs: + +#### Read Failures: Disk Corruption + +``` +2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: storage corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... readChecksum=0x45... +``` +This indicates that the server read back different data to what it wrote to disk +which signals corruption in the storage backend or filesystem. + +For convenience we also increment a metric +`consul.raft.logstore.verifier.read_checksum_failures` when this occurs. + +#### Write Failures: In-flight Corruption + +It's also possible that you might see a different kind of checksum error: + +``` +2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: in-flight corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... followerWriteChecksum=0x45... +``` + +This indicates that the checksum on the follower didn't match the leader when it +_wrote_ the logs which implies that the corruption happened in the network or +software and not the log store. This likely doesn't indicate an issue with the +storage backend but should be handled the same way. + +For convenience we also increment a metric +`consul.raft.logstore.verifier.write_checksum_failures` when this occurs. 
+ +#### Handling Checksum Failures + +If either type of corruption is detected, the only safe way to handle it is to +follow the [revert to BoltDB procedure](#procedure-to-revert-to-boltdb). If the +server is already using BoltDB, the same is true although this is likely to +indicate a latent bug in BoltDB or a bug in our verification code that needs to +be investigated. + +Please report all verification failures via a [GitHub +issue](https://github.com/hashicorp/consul/issues/new?assignees=&labels=&template=bug_report.md&title=WAL:%20Checksum%20Failure). + +It would really help us if you can include: + - Details of your server cluster configuration and hardware + - Logs around the failure message + - Context for how long they have been running the configuration + - Any metrics or description of the workload you have, e.g. how many raft + commits per second as well as the performance metrics described below + +~> We recommend setting up an alert on Consul server logs containing +`verification checksum FAILED` or on the +`consul.raft.logstore.verifier.{read|write}_checksum_failures` metrics. The +sooner a corrupt server is handled, the lower the chance of any of the +[potential risks](#risks) causing problems in your cluster. + +### Performance Metrics + +The key performance metrics to watch are: + +* `consul.raft.commitTime` measures the time to commit new writes on a quorum of + servers. It should be the same or lower after deploying WAL. Even if WAL is + faster for your workload and hardware, it may not be reflected in commitTime + until enough followers are using it that the leader doesn't have to wait for a + slower one (one in a cluster of three, two in a cluster of five etc.). + +* `consul.raft.rpc.appendEntries.storeLogs` measures the time spent persisting + logs to disk on each _follower_. It should be the same or lower for + WAL-enabled followers. + +* `consul.raft.replication.appendEntries.rpc` measures the time taken for each + `AppendEntries` RPC from the leader's perspective. If this is significantly + higher than `consul.raft.rpc.appendEntries` on the follower, it indicates a + known queuing issue in our raft library that will be fixed soon and is + essentially unrelated to the backend. The follower(s) with WAL enabled should + not be slower than the others. You can work out which follower each metric is + for by matching the `peer_id` label value to the server IDs listed by `consul + operator raft list-peers`. + +* `consul.raft.compactLogs` measures the time take to truncate the logs after a + snapshot. WAL-enabled servers should not be slower than BoltDB ones. + +* `consul.raft.leader.dispatchLog` measures the time spent persisting logs to + disk on the _leader_. It is only relevant if a WAL-enabled server becomes a + leader. It should be the same or lower than before when the leader was using + BoltDB. 
\ No newline at end of file diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index 16bf17d84baf..c5e219db19ac 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -294,6 +294,10 @@ { "title": "Upgrading to Latest 1.10.x", "path": "upgrading/instructions/upgrade-to-1-10-x" + }, + { + "title": "Consul 1.15: Experimental WAL LogStore", + "path": "upgrading/instructions/testing-experimental-raft-backend" } ] } From 864b8e83a1e54b98b072d4198a0c2d97bae9b354 Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Wed, 15 Feb 2023 14:23:00 +0000 Subject: [PATCH 02/14] Add tests to verify metrics are registered correctly --- agent/metrics_test.go | 190 ++++++++++++++++++++++++++++++++++++++++++ agent/setup.go | 26 ++++-- 2 files changed, 208 insertions(+), 8 deletions(-) diff --git a/agent/metrics_test.go b/agent/metrics_test.go index 1f649dd07a5b..6f75a4d233b3 100644 --- a/agent/metrics_test.go +++ b/agent/metrics_test.go @@ -432,3 +432,193 @@ func TestHTTPHandlers_AgentMetrics_CACertExpiry_Prometheus(t *testing.T) { }) } + +func TestHTTPHandlers_AgentMetrics_WAL_Prometheus(t *testing.T) { + skipIfShortTesting(t) + // This test cannot use t.Parallel() since we modify global state, ie the global metrics instance + + t.Run("client agent emits nothing", func(t *testing.T) { + hcl := ` + server = false + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + metrics_prefix = "agent_4" + } + raft_logstore { + backend = "wal" + } + bootstrap = false + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + require.NotContains(t, respRec.Body.String(), "agent_4_raft_wal") + }) + + t.Run("server with WAL enabled emits WAL metrics", func(t *testing.T) { + hcl := ` + server = true + bootstrap = true + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + metrics_prefix = "agent_5" + } + connect { + enabled = true + } + raft_logstore { + backend = "wal" + } + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + testrpc.WaitForLeader(t, a.RPC, "dc1") + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + out := respRec.Body.String() + require.Contains(t, out, "agent_5_raft_wal_head_truncations") + require.Contains(t, out, "agent_5_raft_wal_last_segment_age_seconds") + require.Contains(t, out, "agent_5_raft_wal_log_appends") + require.Contains(t, out, "agent_5_raft_wal_log_entries_read") + require.Contains(t, out, "agent_5_raft_wal_log_entries_written") + require.Contains(t, out, "agent_5_raft_wal_log_entry_bytes_read") + require.Contains(t, out, "agent_5_raft_wal_log_entry_bytes_written") + require.Contains(t, out, "agent_5_raft_wal_segment_rotations") + require.Contains(t, out, "agent_5_raft_wal_stable_gets") + require.Contains(t, out, "agent_5_raft_wal_stable_sets") + require.Contains(t, out, "agent_5_raft_wal_tail_truncations") + }) + + t.Run("server without WAL enabled emits no WAL metrics", func(t *testing.T) { + hcl := ` + server = true + bootstrap = true + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + metrics_prefix = "agent_6" + } + connect { + enabled = true + } + raft_logstore { + backend = "boltdb" + } + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + testrpc.WaitForLeader(t, a.RPC, "dc1") + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + require.NotContains(t, respRec.Body.String(), 
"agent_6_raft_wal") + }) + +} + +func TestHTTPHandlers_AgentMetrics_LogVerifier_Prometheus(t *testing.T) { + skipIfShortTesting(t) + // This test cannot use t.Parallel() since we modify global state, ie the global metrics instance + + t.Run("client agent emits nothing", func(t *testing.T) { + hcl := ` + server = false + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + metrics_prefix = "agent_4" + } + raft_logstore { + verification { + enabled = true + interval = "1s" + } + } + bootstrap = false + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + require.NotContains(t, respRec.Body.String(), "agent_4_raft_logstore_verifier") + }) + + t.Run("server with verifier enabled emits all metrics", func(t *testing.T) { + hcl := ` + server = true + bootstrap = true + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + metrics_prefix = "agent_5" + } + connect { + enabled = true + } + raft_logstore { + verification { + enabled = true + interval = "1s" + } + } + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + testrpc.WaitForLeader(t, a.RPC, "dc1") + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + out := respRec.Body.String() + require.Contains(t, out, "agent_5_raft_logstore_verifier_checkpoints_written") + require.Contains(t, out, "agent_5_raft_logstore_verifier_dropped_reports") + require.Contains(t, out, "agent_5_raft_logstore_verifier_ranges_verified") + require.Contains(t, out, "agent_5_raft_logstore_verifier_read_checksum_failures") + require.Contains(t, out, "agent_5_raft_logstore_verifier_write_checksum_failures") + }) + + t.Run("server with verifier disabled emits no extra metrics", func(t *testing.T) { + hcl := ` + server = true + bootstrap = true + telemetry = { + prometheus_retention_time = "5s", + disable_hostname = true + metrics_prefix = "agent_6" + } + connect { + enabled = true + } + raft_logstore { + verification { + enabled = false + } + } + ` + + a := StartTestAgent(t, TestAgent{HCL: hcl}) + defer a.Shutdown() + testrpc.WaitForLeader(t, a.RPC, "dc1") + + respRec := httptest.NewRecorder() + recordPromMetrics(t, a, respRec) + + require.NotContains(t, respRec.Body.String(), "agent_6_raft_logstore_verifier") + }) + +} diff --git a/agent/setup.go b/agent/setup.go index fd4be3af29a1..8dc5e5e18c06 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -91,7 +91,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl } isServer := result.RuntimeConfig.ServerMode - gauges, counters, summaries := getPrometheusDefs(cfg.Telemetry, isServer) + gauges, counters, summaries := getPrometheusDefs(cfg, isServer) cfg.Telemetry.PrometheusOpts.GaugeDefinitions = gauges cfg.Telemetry.PrometheusOpts.CounterDefinitions = counters cfg.Telemetry.PrometheusOpts.SummaryDefinitions = summaries @@ -228,7 +228,7 @@ func newConnPool(config *config.RuntimeConfig, logger hclog.Logger, tls *tlsutil // getPrometheusDefs reaches into every slice of prometheus defs we've defined in each part of the agent, and appends // all of our slices into one nice slice of definitions per metric type for the Consul agent to pass to go-metrics. 
-func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.GaugeDefinition, []prometheus.CounterDefinition, []prometheus.SummaryDefinition) { +func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.GaugeDefinition, []prometheus.CounterDefinition, []prometheus.SummaryDefinition) { // TODO: "raft..." metrics come from the raft lib and we should migrate these to a telemetry // package within. In the mean time, we're going to define a few here because they're key to monitoring Consul. raftGauges := []prometheus.GaugeDefinition{ @@ -272,7 +272,9 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau consul.LeaderPeeringMetrics, xdscapacity.StatsGauges, ) + } + if isServer && cfg.RaftLogStoreConfig.Verification.Enabled { verifierGauges := make([]prometheus.GaugeDefinition, 0) for _, d := range verifier.MetricDefinitions.Gauges { verifierGauges = append(verifierGauges, prometheus.GaugeDefinition{ @@ -280,6 +282,11 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau Help: d.Desc, }) } + gauges = append(gauges, verifierGauges) + } + + if isServer && cfg.RaftLogStoreConfig.Backend == consul.LogStoreBackendWAL { + walGauges := make([]prometheus.GaugeDefinition, 0) for _, d := range wal.MetricDefinitions.Gauges { walGauges = append(walGauges, prometheus.GaugeDefinition{ @@ -287,7 +294,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau Help: d.Desc, }) } - gauges = append(gauges, verifierGauges, walGauges) + gauges = append(gauges, walGauges) } // Flatten definitions @@ -298,7 +305,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau // TODO(kit): Prepending the service to each definition should be handled by go-metrics var withService []prometheus.GaugeDefinition for _, gauge := range g { - gauge.Name = append([]string{cfg.MetricsPrefix}, gauge.Name...) + gauge.Name = append([]string{cfg.Telemetry.MetricsPrefix}, gauge.Name...) withService = append(withService, gauge) } gaugeDefs = append(gaugeDefs, withService...) @@ -339,7 +346,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau // checking if this is a server like we do above for some of the summaries // above. We should probably fix that but I want to not change behavior right // now. If we are a server, add summaries for WAL and verifier metrics. 
- if isServer { + if isServer && cfg.RaftLogStoreConfig.Verification.Enabled { verifierCounters := make([]prometheus.CounterDefinition, 0) for _, d := range verifier.MetricDefinitions.Counters { verifierCounters = append(verifierCounters, prometheus.CounterDefinition{ @@ -347,6 +354,9 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau Help: d.Desc, }) } + counters = append(counters, verifierCounters) + } + if isServer && cfg.RaftLogStoreConfig.Backend == consul.LogStoreBackendWAL { walCounters := make([]prometheus.CounterDefinition, 0) for _, d := range wal.MetricDefinitions.Counters { walCounters = append(walCounters, prometheus.CounterDefinition{ @@ -354,7 +364,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau Help: d.Desc, }) } - counters = append(counters, verifierCounters, walCounters) + counters = append(counters, walCounters) } // Flatten definitions @@ -364,7 +374,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau // TODO(kit): Prepending the service to each definition should be handled by go-metrics var withService []prometheus.CounterDefinition for _, counter := range c { - counter.Name = append([]string{cfg.MetricsPrefix}, counter.Name...) + counter.Name = append([]string{cfg.Telemetry.MetricsPrefix}, counter.Name...) withService = append(withService, counter) } counterDefs = append(counterDefs, withService...) @@ -418,7 +428,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig, isServer bool) ([]prometheus.Gau // TODO(kit): Prepending the service to each definition should be handled by go-metrics var withService []prometheus.SummaryDefinition for _, summary := range s { - summary.Name = append([]string{cfg.MetricsPrefix}, summary.Name...) + summary.Name = append([]string{cfg.Telemetry.MetricsPrefix}, summary.Name...) withService = append(withService, summary) } summaryDefs = append(summaryDefs, withService...) From 56a554aa28d80878f282d70177dbab1e286dd62d Mon Sep 17 00:00:00 2001 From: Tu Nguyen Date: Wed, 22 Feb 2023 22:13:56 -0800 Subject: [PATCH 03/14] refactor and move wal docs --- .../docs/agent/config/config-files.mdx | 84 ++-- website/content/docs/agent/telemetry.mdx | 13 +- .../docs/agent/wal-logstore/enable.mdx | 143 ++++++ .../content/docs/agent/wal-logstore/index.mdx | 58 +++ .../docs/agent/wal-logstore/monitoring.mdx | 86 ++++ .../agent/wal-logstore/revert-to-boltdb.mdx | 89 ++++ .../testing-experimental-raft-backend.mdx | 420 ------------------ website/data/docs-nav-data.json | 25 +- 8 files changed, 444 insertions(+), 474 deletions(-) create mode 100644 website/content/docs/agent/wal-logstore/enable.mdx create mode 100644 website/content/docs/agent/wal-logstore/index.mdx create mode 100644 website/content/docs/agent/wal-logstore/monitoring.mdx create mode 100644 website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx delete mode 100644 website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index 5ad15a3929eb..9a2df3464a37 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -1587,10 +1587,10 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." ## Raft Parameters - `raft_boltdb` ((#raft_boltdb)) **These fields are deprecated in Consul 1.15.0. 
- See the [`raft_logstore`](#raft_logstore) instead.** This is a nested + Use [`raft_logstore`](#raft_logstore) instead.** This is a nested object that allows configuring options for Raft's BoltDB based log store. - - `NoFreelistSync` **This field is deprecated in Consul 1.15.0. See the + - `NoFreelistSync` **This field is deprecated in Consul 1.15.0. Use the [`raft_logstore.boltdb.no_freelist_sync`](#raft_logstore_boltdb_no_freelist_sync) field instead.** Setting this to `true` will disable syncing the BoltDB freelist to disk within the raft.db file. Not syncing the freelist to disk will @@ -1603,12 +1603,12 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." logs and crucial Raft state on disk during writes. This was added in Consul 1.15. - - `backend` ((#raft_logstore_backend)) This allows selection of which storage + - `backend` ((#raft_logstore_backend)) Specifies which storage engine to use to persist logs. Valid options are `boltdb` or `wal`. Default - is `boltdb`. As of Consul 1.15, `wal` is a new and experimental backend that - should be used with caution. See [our experimental WAL backend testing - guide](/consul/docs/upgrading/instructions/testing-experimental-raft-backend) - to learn how to safely evaluate it for your workload. + is `boltdb`. The `wal` option specifies an experimental backend that + should be used with caution. Refer to + [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) + for more information. - `disable_log_cache` ((#raft_logstore_disable_log_cache)) This allows disabling of the in-memory cache of recent logs. This exists mostly for @@ -1637,47 +1637,45 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." on that server. The only correct response is to stop the server, remove its data directory, and restart so it can be caught back up with a correct server again. Please report verification failures including details about - your hardware and workload via GitHub issues. See [our experimental WAL - backend testing - guide](/consul/docs/upgrading/instructions/testing-experimental-raft-backend) - for more details on using these to evaluate a new backend. - - - `enabled` ((#raft_logstore_verification_enabled)) - Setting this to `true` - will enable log verification checkpoints to be written (if leader) and - verified on this server. - - - `interval` ((#raft_logstore_verification_interval)) - The time interval - between checkpoints. There is no default so both `enabled` and `interval` - must be set explicitly to correctly enable. An interval of `30s` to `5m` - is most likely to be useful. The performance overhead should not be - significant at any level above a few seconds, it's mostly useful to + your hardware and workload via GitHub issues. Refer to + [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) + for more information. + + - `enabled` ((#raft_logstore_verification_enabled)) - Set to `true` to + allow this Consul server to write and verify log verification checkpoints + when it is elected leader. + + - `interval` ((#raft_logstore_verification_interval)) - Specifies the time + interval between checkpoints. There is no default value. You must + configure the `interval` and set [`enabled`](#raft_logstore_verification_enabled) + to `true` to correctly enable intervals. We recommend using an interval + between `30s` and `5m`. The performance overhead is insignificant if the + interval is set to `5m` or less. 
We recommend setting an interval to control how frequently the report logs appear for human observation. - - `boltdb` ((#raft_logstore_boltdb)) - This is a nested object that allows - configuring options for Raft's `boltdb` backend. It has no effect if the - `backend` is not `boltdb`. + - `boltdb` ((#raft_logstore_boltdb)) - Object that configures options for + Raft's `boltdb` backend. It has no effect if the `backend` is not `boltdb`. - - `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Setting this - to `true` will disable syncing the BoltDB freelist to disk within the - raft.db file. Not syncing the freelist to disk will reduce disk IO required - for write operations at the expense of potentially increasing start up time - due to needing to scan the db to discover where the free space resides + - `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Set to + `true` to disable storing BoltDB freelist to disk within the + `raft.db` file. Disabling freelist syncs reduces the disk IO required + for write operations, but could potentially increase start up time + because Consul must scan the database to find free space within the file. - - `wal` ((#raft_logstore_wal)) - This is a nested object that allows - configuring the `wal` backend. See [our experimental WAL backend testing - guide](/consul/docs/upgrading/instructions/testing-experimental-raft-backend) - for more details on safely evaluating this backend. - - - `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - This is an - integer representing the target size (in MB) for each segment file before - rolling to a new segment. The default is 64 and should be suitable for - almost all deployments. A smaller value may use less disk space since - it can be reclaimed by deleting old segments sooner at the expense of - performing a more costly operation to safely rotate to a new file more - frequently which could impact tail latencies. Larger values are unlikely - to improve performance significantly. This is mostly exposed for - performance testing purposes. + - - `wal` ((#raft_logstore_wal)) - Object that configures the `wal` backend. + Refer to [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) + for more information. + + - `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - Integer value + that represents the target size in MB for each segment file before + rolling to a new segment. The default is `64` and is suitable for + most deployments. A smaller value may use less disk space because you + can reclaim space by deleting old segments sooner, but a smaller segment + may affect performance because safely rotating to a new file more + frequently could impact tail latencies. Larger values are unlikely + to improve performance significantly. We recommend using this + configuration for performance testing purposes. - `raft_protocol` ((#raft_protocol)) Equivalent to the [`-raft-protocol` command-line flag](/consul/docs/agent/config/cli-flags#_raft_protocol). diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index fb79f71d9dcb..df8fdef15dc4 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -342,10 +342,7 @@ may help to reduce disk IO and log storage operation times. Disabling free list the startup time for a server as it must scan the raft.db file for free space instead of loading the already populated free list structure. 
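For reference, a minimal server configuration sketch that sets this option through the newer `raft_logstore` block described above might look like the following. The values are illustrative only, not recommendations.

```hcl
# Illustrative only: disable BoltDB freelist syncing through the raft_logstore
# block. The older raft_boltdb.NoFreelistSync field expresses the same setting
# but is deprecated in Consul 1.15.
raft_logstore {
  backend = "boltdb"

  boltdb {
    no_freelist_sync = true
  }
}
```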
-~> Note that as of Consul 1.15 there is a [new experimental storage -backend](http://localhost:3000/consul/docs/upgrading/instructions/testing-experimental-raft-backend) -that can be trialed instead of BoltDB. - +Consul includes an experiment backend configuration that you can use instead of BoldDB. Refer to [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) for more information. ## Metrics Reference @@ -457,9 +454,11 @@ These metrics are used to monitor the health of the Consul servers. | `consul.raft.leader.dispatchLog` | Measures the time it takes for the leader to write log entries to disk. | ms | timer | | `consul.raft.leader.dispatchNumLogs` | Measures the number of logs committed to disk in a batch. | logs | gauge | | `consul.raft.logstore.verifier.checkpoints_written` | Counts the number of checkpoint entries written to the LogStore. | checkpoints | counter | -| `consul.raft.logstore.verifier.dropped_reports` | Counts how many times the verifier routine was still busy when the next checksum came in and so verification for a range was skipped. If you see this happen, consider increasing the interval between checkpoints with [`raft_logstore.verification.interval`](http://localhost:3000/consul/docs/agent/config/config-files#raft_logstore_verification) | reports dropped | counter | -| `consul.raft.logstore.verifier.ranges_verified` | Counts the number of log ranges for which a verification report has been completed. See [monitoring experimental backends](http://localhost:3000/consul/docs/upgrading/instructions/testing-experimental-raft-backend#monitoring) for more information. | log ranges verifications | counter | -| `consul.raft.logstore.verifier.read_checksum_failures` | Counts the number of times a range of logs between two check points contained at least one disk corruption. See [monitoring experimental backends](http://localhost:3000/consul/docs/upgrading/instructions/testing-experimental-raft-backend#monitoring) for more information. | disk corruptions | counter | +| `consul.raft.logstore.verifier.dropped_reports` | Counts how many times the verifier routine was still busy when the next checksum came in and so verification for a range was skipped. If you see this happen, consider increasing the interval between checkpoints with [`raft_logstore.verification.interval`](/consul/docs/agent/config/config-files#raft_logstore_verification) | reports dropped | counter | +| `consul.raft.logstore.verifier.ranges_verified` | Counts the number of log ranges for which a verification report has been completed. Refer to [Monitor Raft metrics and logs for WAL +](/consul/docs/agent/wal-logstore/monitoring) for more information. | log ranges verifications | counter | +| `consul.raft.logstore.verifier.read_checksum_failures` | Counts the number of times a range of logs between two check points contained at least one disk corruption. Refer to [Monitor Raft metrics and logs for WAL +](/consul/docs/agent/wal-logstore/monitoring) for more information. | disk corruptions | counter | | `consul.raft.logstore.verifier.write_checksum_failures` | Counts the number of times a follower has a different checksum to the leader at the point where it writes to the log. This could be caused by either a disk-corruption on the leader (unlikely) or some other corruption of the log entries in-flight. | in-flight corruptions | counter | | `consul.raft.leader.lastContact` | Measures the time since the leader was last able to contact the follower nodes when checking its leader lease. 
It can be used as a measure for how stable the Raft timing is and how close the leader is to timing out its lease.The lease timeout is 500 ms times the [`raft_multiplier` configuration](/consul/docs/agent/config/config-files#raft_multiplier), so this telemetry value should not be getting close to that configured value, otherwise the Raft timing is marginal and might need to be tuned, or more powerful servers might be needed. See the [Server Performance](/consul/docs/install/performance) guide for more details. | ms | timer | | `consul.raft.leader.oldestLogAge` | The number of milliseconds since the _oldest_ log in the leader's log store was written. This can be important for replication health where write rate is high and the snapshot is large as followers may be unable to recover from a restart if restoring takes longer than the minimum value for the current leader. Compare this with `consul.raft.fsm.lastRestoreDuration` and `consul.raft.rpc.installSnapshot` to monitor. In normal usage this gauge value will grow linearly over time until a snapshot completes on the leader and the log is truncated. Note: this metric won't be emitted until the leader writes a snapshot. After an upgrade to Consul 1.10.0 it won't be emitted until the oldest log was written after the upgrade. | ms | gauge | diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx new file mode 100644 index 000000000000..c73feb31b75b --- /dev/null +++ b/website/content/docs/agent/wal-logstore/enable.mdx @@ -0,0 +1,143 @@ +--- +layout: docs +page_title: Enable the experimental WAL LogStore backend +description: >- + Learn how to safely configure and test the experimental WAL backend in your Consul deployment. +--- + +# Enable the experimental WAL LogStore backend + +This topic describes how to safely configure and test the WAL backend in your Consul deployment. + +The overall process for enabling the WAL LogStore backend for one server consists of the following steps. You will need to repeat these steps for all Consul servers. + +1. Enable log verification. +1. Select target server to enable WAL. +1. Stop target server gracefully. +1. Remove data directory from target server. +1. Update target server's configuration. +1. Start target server. +1. Monitor target server raft metrics and logs. + +!> **Upgrade warning:** The WAL LogStore backend is experimental. + +## Requirements + +- All servers in the Datacenter should be upgraded to Consul 1.15 using the [standard upgrade procedure](/consul/docs/upgrading/general-process) and the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x). +- You need a Consul cluster with at least 3 nodes to safely test the new backend without downtime. + +In addition, we recommend: + +- Taking a snapshot prior to testing in case things don't go to plan. +- Monitoring Consul server metrics and logs, and setting an alert on specific log events occurring. +- Enabling WAL in a pre-production environment and leave it running for a period of time (few days or weeks) before enabling it in production. + +## Risks + +Although this document describes configuring and testing the WAL backend in a way that limits risk, there still are potential risks: + + - If WAL is enabled on a server and is found to corrupt data in some way. That server's data can't be recovered. The server will need to be restarted with an empty data directory and reload it's state from the leader. 
 - It's possible that WAL might corrupt data or otherwise have a bug that causes the server to panic and crash. It may even fail to restart if the same bug occurs when it reads from the logs on startup. In that case, as above, the server would need to be restarted with WAL disabled and an empty data directory.
 - It's _possible_ though unlikely that if WAL corrupted data, clients might read corrupted data from that server. For example, they might see invalid IP addresses or tokens that fail to match. This is unlikely even if a WAL corruption bug is hit because replication typically takes place using in-memory cached objects rather than reads from disk. The issue would be fixed by restoring the server.
 - If you enable a server to use WAL using Consul OSS or on a voting server with Consul Enterprise, it's _possible_ that the WAL could cause corruption of that server's state (with the caveats above) _and then_ become the leader and replicate that corruption to all other servers. In this scenario, only a restore from backup would recover a completely uncorrupted state. If you test on a non-voting server in Enterprise, this can't happen.

## Enable log verification

You must enable log verification on any voting server in Enterprise and all servers in OSS since the leader writes verification checkpoints.

On each voting server, add the following to the server's configuration file:

```hcl
raft_logstore {
  verification {
    enabled = true
    interval = "60s"
  }
}
```

Restart each server to apply the changes (`consul reload` is not sufficient). Wait for each one to become a healthy voter again using `consul operator raft list-peers` before moving on to the next. This can take a few minutes if the snapshot is large.

Your log entries on the servers should be similar to the following:

```log hideClipboard
2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c
```

## Select target server to enable WAL

If you are using Consul OSS or Consul Enterprise without non-voting servers, select one of the follower servers. As noted in [Risks](#risks), Consul Enterprise users with non-voting servers should select a non-voting server at first.

Retrieve the current state of the servers by running the following command:

```shell-session
$ consul operator raft list-peers
```

## Stop target server

Stop the target server gracefully. For example, if you are using `systemd`,
run the following command:

```shell-session
$ systemctl stop consul
```

If you have any configuration management automation that might interfere with this process (for example, Chef or Puppet), you must disable it until you have completely enabled WAL as a storage backend.

## Remove data directory from target server

Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case enabling WAL is unsuccessful. However, once you restart the server, you should not use the old data directory (`/data-dir/raft.bak`) for recovery, and you will eventually need to delete it.

Move the data directory. Replace `/data-dir` with the value you have specified in your configuration file.
+ +```shell-session +$ mv /data-dir/raft /data-dir/raft.bak +``` + +When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory since the log must always be consistent with the snapshots to avoid undefined behavior or data loss. + +## Update target server's configuration + +Add the following to the target server's configuration file: + +```hcl +raft_logstore { + backend = "wal" + verification { + enabled = true + interval = "60s" + } +} +``` + +## Start target server + +Start the target server. For example, if you are using `systemcmd`, run the following command: + +```shell-session +$ systemctl start consul +``` + +Watch for the server to become a healthy voter again. + +```shell-session +$ consul operator raft list-peers +``` + +## Monitor target server Raft metrics and logs + +See the section below on [monitoring WAL tests](/consul/docs/agent/wal-logstore/monitoring). + +We recommend you leave the cluster in this configuration for days or weeks, assuming that you see no negative metrics or verification errors in logs to increase confidence in the WAL under varying workloads and during routine server restarts etc. + +If you disabled Chef, Puppet or similar earlier, you may want to consider enabling it again while the test runs. Ensure that it will not "fix" the Consul configuration file and remove the different backend though. + +## Next steps + +- If you see any verification errors, performance anomalies or other suspicious behavior from the target server during the test, you should follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). + +- If you see no errors and would like to expand the test further, you can repeat the above procedure on another target server. We suggest waiting a while after each and slowly rolling out. Once the majority of your servers are using WAL any bugs not yet found could result in cluster unavailability. + +- If you wish to permanently enable `wal` on all servers, you'll need to follow the above steps on each one. Even if `backend = "wal"` is set in logs, servers will continue to use BoltDB if they find an existing raft.db file in the data dir. \ No newline at end of file diff --git a/website/content/docs/agent/wal-logstore/index.mdx b/website/content/docs/agent/wal-logstore/index.mdx new file mode 100644 index 000000000000..c48144f8434f --- /dev/null +++ b/website/content/docs/agent/wal-logstore/index.mdx @@ -0,0 +1,58 @@ +--- +layout: docs +page_title: WAL LogStore Backend Overview +description: >- + Consul 1.15 introduced a new experimental storage backend option. Learn how to + configure and test it out in a safe way. +--- + +# Experimental WAL LogStore backend overview + +This topic provides an overview of the experimental WAL (write-ahead log) LogStore backend. + +!> **Upgrade warning:** The WAL LogStore backend is experimental. + +## Introduction + +Consul ships with an experimental storage backend called write-ahead log (WAL). + +WAL implements a traditional log with rotating, append-only log files. The current `LogStore` uses BoltDB, which is a copy-on-write BTree, which is less optimized for append-only workloads. + +### WAL versus BoldDB + +The WAL backend has been written to resolve some long-standing issues with the current BoltDB backend. The existing BoltDB log store has worked reliably for most users for years, however it is not the most efficient way to store append-only logs to disk since it was designed as a full key-value database. 
It was an expedient option when our raft library was first written and always assumed we'd replace it with something more purpose-built. + +Importantly, a BoltDB database is a single file that only ever grows. Deleting the oldest logs which we do regularly when we've made a new snapshots of the state, leaves free space in the file that needs to be tracked to be re-used on future writes. By contrast a simple segmented log can just delete the oldest log files from disk. When BoltDB is used as a log backend, sudden burst of writes at a rate 2-3x higher than the normal volume, can suddenly cause the log file to grow to several times it's steady-state size. After the next snapshot is taken, and the oldest logs truncated again, the file is left as mostly empty space. Tracking this free space requires writing extra metadata proportional to the amount of free pages to disk with every write and so after such a burst, write latencies tend to increase - in some cases dramatically causing serious performance degradation to the cluster. + +Even if this has never happened to a catastrophic degree in a cluster, Consul was tuned to avoid for too many logs to accumulate in the LogStore, in order to reduce/mitigate the risk. Significantly larger BoltDB files are somewhat slower in general because it's a tree and so still has log(N) work to do `n` every write. But out user's experience showed that the larger the file, the more likely it is to have a large freelist or suddenly form one after a burst of writes. For this reason, the default options for how frequently we make a full snapshot and truncate the logs, and for how many logs we keep around have always been aggressively set towards keeping BoltDB small rather than using disk IO the most efficiently. + +Other reliability issues such as [followers being unable to catch +up](/consul/docs/agent/telemetry#raft-replication-capacity-issues) also stem +from this need to carefully balance the size of the BoltDB log store against how +long snapshots take to restore - there is a simple solution to that issue if +letting logs grow much larger to ensure recovery didn't have a potentially +catastrophic impact on the cluster's write performance. + +While not every user will experience a huge difference in performance, the WAL +backend avoids these performance concerns entirely. It is more performant when +directly measured due to solving a simpler storage problem than BoltDB was +designed for. For example, it can commit a single log entry with one `fsync` instead +of two, and tends to write 2-3x fewer bytes to the disk to do it. The real +benefit though is that retaining more logs won't impact write performance at all +and so strategies for reducing disk IO with slower snapshots or for keeping logs around to permit slower followers to catch up with cluster state, are all possible. + +## Benefits + +The new WAL backend has been tested thoroughly during development: + +- Every component in the WAL like [metadata management](https://github.com/hashicorp/raft-wal/blob/main/types/meta.go), [log file encoding](https://github.com/hashicorp/raft-wal/blob/main/types/segment.go) to actual [file-system interaction](https://github.com/hashicorp/raft-wal/blob/main/types/vfs.go) was abstracted so unit tests can simulate all sorts of difficult-to-reproduce disk failures. + +- We [used ALICE](https://github.com/hashicorp/raft-wal/blob/main/alice/README.md) to exhaustively simulate thousands of possible crash failure scenarios and test that WAL correctly recovered from each. 
+ +- We ran hundreds of tests over a few weeks in a performance testing cluster with checksum verification enabled and detected zero cases of data loss or corruption. We plan to continue testing this continuously over the next few months too before making it the default backend. + +However, we are well aware of both how complex and how critical disk-persistence is for our user's data. + +Our hope is that we will have many users at all degrees of scale try WAL in their environments after upgrading to 1.15 and report success or failure back so we have increased confidence before we make it the default for new clusters. + +This guide describes how to safely try and verify it without risking the availability of your cluster should there be a latent data-loss issue discovered. \ No newline at end of file diff --git a/website/content/docs/agent/wal-logstore/monitoring.mdx b/website/content/docs/agent/wal-logstore/monitoring.mdx new file mode 100644 index 000000000000..7d50fe4f2b8b --- /dev/null +++ b/website/content/docs/agent/wal-logstore/monitoring.mdx @@ -0,0 +1,86 @@ +--- +layout: docs +page_title: Monitor Raft metrics and logs for WAL +description: >- + Consul 1.15 introduced a new experimental storage backend option. Learn how to + configure and test it out in a safe way. +--- + +# Monitor Raft metrics and logs for WAL + +Throughout the testing period, it's important to monitor the cluster and especially the target server for signals that the WAL is not performing properly or is behaving incorrectly. + +!> **Upgrade warning:** The WAL LogStore backend is experimental. + +## Monitor for checksum failures + +If the log store verification fails on any server (whether it's running BoltDB or WAL backed), that is an **unrecoverable error**. It will look something like this in the logs: + +### Read Failures: Disk Corruption + +```log hideClipboard +2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: storage corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... readChecksum=0x45... +``` + +This indicates that the server read back different data to what it wrote to disk which signals corruption in the storage backend or filesystem. + +For convenience, we also increment a metric `consul.raft.logstore.verifier.read_checksum_failures` when this occurs. + +### Write Failures: In-flight Corruption + +It's also possible that you might see a different kind of checksum error: + +```log hideClipboard +2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: in-flight corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... followerWriteChecksum=0x45... +``` + +This indicates that the checksum on the follower didn't match the leader when it _wrote_ the logs which implies that the corruption happened in the network or software and not the log store. This likely doesn't indicate an issue with the storage backend but should be handled the same way. + +For convenience, we also increment a metric `consul.raft.logstore.verifier.write_checksum_failures` when this occurs. + +### Handling Checksum Failures + +If either type of corruption is detected, the only safe way to handle it is to follow the [revert to BoltDB procedure](/consul/docs/agent/wal-logstore/revert-to-boltdb). If the server is already using BoltDB, the same is true although this is likely to indicate a latent bug in BoltDB or a bug in our verification code that needs to be investigated. 
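As a rough way to check for these failures on a server whose logs go to journald, you can search recent logs for the failure marker. This sketch assumes Consul runs under systemd with a unit named `consul`; adjust the unit name and time window for your environment.

```shell-session
$ journalctl -u consul --since "24 hours ago" | grep "verification checksum FAILED"
```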
+ +Please report all verification failures via a [GitHub +issue](https://github.com/hashicorp/consul/issues/new?assignees=&labels=&template=bug_report.md&title=WAL:%20Checksum%20Failure). + +In your report, include the following: + - Details of your server cluster configuration and hardware + - Logs around the failure message + - Context for how long they have been running the configuration + - Any metrics or description of the workload you have, e.g. how many raft + commits per second as well as the performance metrics described below + +We recommend setting up an alert on Consul server logs containing `verification checksum FAILED` or on the `consul.raft.logstore.verifier.{read|write}_checksum_failures` metrics. The sooner a corrupt server is handled, the lower the chance of any of the [potential risks](/consul/docs/agent/wal-logstore/enable#risks) causing problems in your cluster. + +## Performance Metrics + +The key performance metrics to watch are: + +- `consul.raft.commitTime` measures the time to commit new writes on a quorum of + servers. It should be the same or lower after deploying WAL. Even if WAL is + faster for your workload and hardware, it may not be reflected in commitTime + until enough followers are using it that the leader doesn't have to wait for a + slower one (one in a cluster of three, two in a cluster of five etc.). + +- `consul.raft.rpc.appendEntries.storeLogs` measures the time spent persisting + logs to disk on each _follower_. It should be the same or lower for + WAL-enabled followers. + +- `consul.raft.replication.appendEntries.rpc` measures the time taken for each + `AppendEntries` RPC from the leader's perspective. If this is significantly + higher than `consul.raft.rpc.appendEntries` on the follower, it indicates a + known queuing issue in our raft library that will be fixed soon and is + essentially unrelated to the backend. The follower(s) with WAL enabled should + not be slower than the others. You can work out which follower each metric is + for by matching the `peer_id` label value to the server IDs listed by `consul + operator raft list-peers`. + +- `consul.raft.compactLogs` measures the time take to truncate the logs after a + snapshot. WAL-enabled servers should not be slower than BoltDB ones. + +- `consul.raft.leader.dispatchLog` measures the time spent persisting logs to + disk on the _leader_. It is only relevant if a WAL-enabled server becomes a + leader. It should be the same or lower than before when the leader was using + BoltDB. \ No newline at end of file diff --git a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx new file mode 100644 index 000000000000..3f95f4f7786a --- /dev/null +++ b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx @@ -0,0 +1,89 @@ +--- +layout: docs +page_title: Revert to BoltDB +description: >- + Learn how to safely configure and test the experimental WAL backend in your Consul deployment. +--- + +# Revert storage backend to BoltDB from WAL + +This topic describes revert your Consul storage backend from the experimental WAL LogStorage backend to the default BoltDB. + +The overall process for reverting to BoltDB consists of the following steps. You will need to repeat these steps for all Consul servers. Notice this is very similar to enabling WAL. + +1. Select target server. +1. Stop target server gracefully. +1. Remove data directory from target server. +1. Update target server's configuration. +1. Start target server. 
+ +## Select target server + +If you are using Consul OSS or Consul Enterprise without non-voting servers, select one of the follower servers. As noted in [Risks](/consul/docs/agent/wal-logstore/enable#risks), Consul Enterprise users with non-voting servers should select a non-voting server at first. + +Retrieve the current state of the servers by running the following: + +```shell-session +$ consul operator raft list-peers +``` + +## Stop target server gracefully + +Stop the target server gracefully. For example, if you are using `systemcmd`, +run the following command: + +```shell-session +$ systemctl stop consul +``` + +If you have any configuration management automation that might interfere with this process (for example, Chef or Puppet), you must disable them until you have completely enabled WAL +as a storage backend. + + +## Remove data directory from target server + +Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. However, once you restart the server, you should not use the old data directory (`/data-dir/raft.bak`) for recovery, and will eventually need to delete it. + +Move the data directory. Replace `/data-dir` the value you have specified in your configuration file. + +```shell-session +$ mv /data-dir/raft /data-dir/raft.bak +``` + +When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory since the log must always be consistent with the snapshots to avoid undefined behavior or data loss. + +## Update target server's configuration + +Modify the `backend` in the target server's configuration file: + +```hcl +raft_logstore { + backend = "boltdb" + verification { + enabled = true + interval = "60s" + } +} +``` + +## Start target server + +Start the target server. For example, if you are using `systemcmd`, run the following command: + +```shell-session +$ systemctl start consul +``` + +Watch for the server to become a healthy voter again. + +```shell-session +$ consul operator raft list-peers +``` + +### Clean up old data directories + +If necessary, cleanup any `raft.bak` directories. Replace `/data-dir` the value you have specified in your configuration file. + +```shell-session +$ rm /data-dir/raft.bak +``` \ No newline at end of file diff --git a/website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx b/website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx deleted file mode 100644 index e5360eab0918..000000000000 --- a/website/content/docs/upgrading/instructions/testing-experimental-raft-backend.mdx +++ /dev/null @@ -1,420 +0,0 @@ ---- -layout: docs -page_title: Testing the Experimental WAL LogStore Backend -description: >- - Consul 1.15 introduced a new experimental storage backend option. Learn how to - configure and test it out in a safe way. ---- - -# Testing the Experimental WAL LogStore Backend - -## Introduction - -Consul 1.15 introduced a new experimental storage backend. This guide explains -how to configure it and test it out safely. The new backend is called `wal` in -configuration. - -WAL is a acronym for "Write-Ahead Log". We called it this because it implements -a traditional log with rotating, append-only log files. The current `LogStore` -uses BoltDB which is a copy-on-write BTree which is less optimized for -append-only workloads. - -~> The `wal` backend is considered **experimental** in Consul 1.15. 
Please test -it safely in pre-production first and verify on a subset of servers in -production using this guide before fully enabling it on all servers. - -## Why build a new backend? - -The WAL backend has been written to resolve some long-standing issues with the -current BoltDB backend. The existing BoltDB log store has worked reliably for -most users for years, however it is not the most efficient way to store -append-only logs to disk since it was designed as a full key-value database. It -was an expedient option when our raft library was first written and always -assumed we'd replace it with something more purpose-built. - -Importantly, a BoltDB database is a single file that only ever grows. Deleting -the oldest logs which we do regularly when we've made a new snapshots of the -state, leaves free space in the file that needs to be tracked to be re-used on -future writes. By contrast a simple segmented log can just delete the oldest log -files from disk. With BoltDB uses as a log, sudden burst of writes say three -times larger than the normal volume can suddenly cause the file to grow to about -several times it's steady-state size. After the next snapshot is taken, and the -oldest logs truncated again, the file is left as mostly empty space. Tracking -this free space requires writing extra metadata proportional to the amount of -free pages to disk with every write and so after such a burst, write latencies -tend to increase - in some cases dramatically causing serious performance -degradation to the cluster. - -Even if this has never happened to a catastrophic degree in a cluster, the fact -that it's a risk has meant that Consul has erred on the side of never letting -too many logs accumulate in the LogStore. Significantly larger BoltDB files are -somewhat slower in general because it's a tree and so still has log(N) work to -do n every write. But out user's experience showed that the larger the file, the -more likely it is to have a large freelist or suddenly form one after a burst of -writes. For this reason, the default options for how frequently we make a full -snapshot and truncate the logs, and for how many logs we keep around have always -been aggressively set towards keeping BoltDB small rather than using disk IO the -most efficiently. - -Other reliability issues such as [followers being unable to catch -up](/consul/docs/agent/telemetry#raft-replication-capacity-issues) also stem -from this need to carefully balance the size of the BoltDB log store against how -long snapshots take to restore - there is a simple solution to that issue if -letting logs grow much larger to ensure recovery didn't have a potentially -catastrophic impact on the cluster's write performance. - -While not every user will experience a huge difference in performance, the WAL -backend avoids these performance concerns entirely. It is more performant when -directly measured due to solving a simpler storage problem than BoltDB was -designed for. For example it can commit a single log entry with on fsync instead -of two, and tends to write 2-3x fewer bytes to the disk to do it. The real -benefit though is that retaining more logs won't impact write performance at all -and so reducing disk IO with slower snapshots or keeping logs around to catch up -slower followers are all possible. - -## Why try it out? 
- -The new WAL backend has been tested thoroughly during development: - * Every component in the WAL like [metadata - management](https://github.com/hashicorp/raft-wal/blob/main/types/meta.go), - [log file - encoding](https://github.com/hashicorp/raft-wal/blob/main/types/segment.go) - to actual [file-system - interaction](https://github.com/hashicorp/raft-wal/blob/main/types/vfs.go) - was abstracted so unit tests can simulate all sorts of difficult-to-reproduce - disk failures. - * We [used - ALICE](https://github.com/hashicorp/raft-wal/blob/main/alice/README.md), to - exhaustively simulate thousands of possible crash failure scenarios and test - that WAL correctly recovered from each. - * We ran hundreds of tests over a few weeks in a performance testing cluster - with checksum verification enabled and detected zero cases of data loss or - corruption. We plan to continue testing this continuously over the next few - months too before making it the default backend. - -However, we are well aware of both how complex and how critical disk-persistence -is for our user's data. - -Our hope is that we will have many users at all degrees of scale try WAL in -their environments after upgrading to 1.15 and report success or failure back so -we have increased confidence before we make it the default for new clusters. - -This guide describes how to safely try and verify it without risking the -availability of your cluster should there be a latent data-loss issue -discovered. - -## Requirements - -- All servers in the Datacenter should be upgraded to Consul 1.15 using the - [standard upgrade procedure](/consul/docs/upgrading/general-process) and - the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x). -- You need a Consul cluster with at least 3 nodes to safely test the new - backend without downtime. - -## Assumptions - -This guide makes the following assumptions: - -- You have taken a snapshot prior to testing in case things don't go to plan. -- You have the ability to monitor Consul server metrics and logs, and ideally - set an alert on specific log events occurring. -- We assume that you will try this in a pre-production environment first and - leave it running for a few days or weeks to gain confidence before attempting - it in production. - -## Risks - -Although we are describing a way to test this that limits risk as far as -possible, there still are potential risks it's important to know: - - - If WAL is enabled on a server and is found to corrupt data in some way. That - server's data can't be recovered. The server will need to be restarted with - an empty data directory and reload it's state from the leader. - - It's possible that WAL might corrupt data or otherwise have a bug that causes - the server to panic and crash. It may even not be able to restart if the same - bug occurs when it reads from the logs on startup. In this case as above the - server would need to be restarted with WAL disabled and an empty data - directory. - - It's _possible_ though unlikely that if WAL corrupted data, clients might - read corrupted data from that server. For example invalid IP addresses or - have tokens fail to match. This is unlikely even if there is a WAL corruption - bug hit because replication typically takes place using in-memory cached - objects rather than reads from disk. The issue would be fixed by restoring - the server. 
- - If you enable a server to use WAL using Consul OSS or on a voting server with - Consul Enterprise, it's _possible_ that the WAL could cause corruption of - that server's state (with the caveats above) _and then_ become the leader and - replicate that corruption to all other servers. In this scenario only a - restore from backup would recover a completely un-corrupt state. - - If you test on a non-voting server in Enterprise, this can't happen. - -## Procedure to enable WAL on one server - -**1. Enable log verification (new in 1.15).** - -This needs to be enabled on any voting server in Enterprise and all servers in -OSS since it is the leader that writes verification checkpoints. - -On each voting server add the following to the server's configuration file: - -```hcl -raft_logstore { - verification { - enabled = true - interval = "60s" - } -} -``` - -You'll need to restart each server in turn for that to take effect (`consul -reload` is not sufficient). Wait for each one to become a healthy voter again -using `consul operator raft list-peers` before moving on to the next. This can -take a few minutes if the snapshot is large. - -You should now see log entries on the servers every minute that look like this: - -``` -2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c -``` - - -**2. Select a server to enable WAL on.** - -As noted in [Risks](#risks), Consul Enterprise users should select a non-voting -server at first. For Consul OSS users, or Enterprise users who don't have -non-voting servers, select one of the follower servers. - -You can find the current state of the servers by running. - -```shell-session -$ consul operator raft list-peers -``` - -We'll refer to this server as the "target" server from now on. - - -**3. Stop the target server gracefully** - -For example using `systemctl stop consul` if using systemd. - -~> If you have any configuration management automation like Chef or Puppet that -might interfere with this process, they must be disabled until the process is -complete. - - -**4. Remove the data directory from the target server.** - -Moving rather than deleting is less destructive in case of command errors. Once -the server has restarted though, the data in the old dir should not be used for -recovery and will eventually need to be removed entirely. - -Replace `/data-dir` with whatever is set in `data_dir` in your configuration -file. - -```shell-session -$ mv /data-dir/raft /data-dir/raft.bak -``` - - -**5. Update the target server's configuration.** - -Add the following to the target server's configuration file: - -```hcl -raft_logstore { - backend = "wal" - verification { - enabled = true - interval = "60s" - } -} -``` - -**6. Start the target server.** - -For example with `systemctl start consul` if using systemd. - -**7. Monitor target server raft metrics and logs.** - -See the section below on [monitoring WAL tests](#monitoring-wal-tests). - -We suggest you leave the cluster in this configuration for days or weeks, -assuming that you see no negative metrics or verification errors in logs to -increase confidence in the WAL under varying workloads and during routine server -restarts etc. - -~> If you disabled Chef, Puppet or similar earlier, you may want to consider -enabling it again while the test runs. Ensure that it will not "fix" the Consul -configuration file and remove the different backend though. 
- -**Next Steps.** - - * If you see any verification errors, performance anomalies or other suspicious -behavior from the target server during the test, you should follow [the -procedure to revert back to BoltDB](#procedure-to-revert-to-boltdb). - - * If you see no errors and would like to expand the test further, you can repeat -the above procedure on another target server. We suggest waiting a while after -each and slowly rolling out. Once the majority of your servers are using WAL any -bugs not yet found could result in cluster unavailability. - - * If you wish to permanently enable `wal` on all servers, you'll need to follow -the above steps on each one. Even if `backend = "wal"` is set in logs, servers -will continue to use BoltDB if they find an existing raft.db file in the data -dir. - -~> When switching backend, you must always remove the _whole raft directory_ -not just the `raft.db` file or `wal` directory since the log must always be -consistent with the snapshots to avoid undefined behavior or data loss. - -## Procedure to revert to BoltDB - -To revert a server that is using `wal` to using BoltDB, the steps are -essentially the same as the procedure above. - -**1. Stop the target server gracefully** - -For example using `systemctl stop consul` if using systemd. - -~> If you have any configuration management automation like Chef or Puppet that -might interfere with this process, they must be disabled until the process is -complete. - - -**2. Remove the data directory from the target server.** - -Moving rather than deleting is less destructive in case of command errors. Once -the server has restarted though, the data in the old dir should not be used for -recovery and will eventually need to be removed entirely. - -Replace `/data-dir` with whatever is set in `data_dir` in your configuration -file. - -```shell-session -$ mv /data-dir/raft /data-dir/raft.bak -``` - - -**3. Update the target server's configuration.** - -Modify the `backend` in the target server's configuration file: - -```hcl -raft_logstore { - backend = "boltdb" - verification { - enabled = true - interval = "60s" - } -} -``` - -**4. Start the target server.** - -For example with `systemctl start consul` if using systemd. - -**5. Watch for the server to become a healthy voter again.** - -```shell-session -$ consul operator raft list-peers -``` - -**6. If necessary, cleanup any `raft.bak` directories.** - -```shell-session -$ rm /data-dir/raft.bak -``` - -## Monitoring - -Throughout the testing period, it's important to monitor the cluster and -especially the target server for signals that the WAL is not performing properly -or is behaving incorrectly. - -### Monitor for checksum failures - -If the log store verification fails on any server (whether it's running BoltDB -or WAL backed), that is an **unrecoverable error**. It will look something like -this in the logs: - -#### Read Failures: Disk Corruption - -``` -2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: storage corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... readChecksum=0x45... -``` -This indicates that the server read back different data to what it wrote to disk -which signals corruption in the storage backend or filesystem. - -For convenience we also increment a metric -`consul.raft.logstore.verifier.read_checksum_failures` when this occurs. 
- -#### Write Failures: In-flight Corruption - -It's also possible that you might see a different kind of checksum error: - -``` -2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: in-flight corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... followerWriteChecksum=0x45... -``` - -This indicates that the checksum on the follower didn't match the leader when it -_wrote_ the logs which implies that the corruption happened in the network or -software and not the log store. This likely doesn't indicate an issue with the -storage backend but should be handled the same way. - -For convenience we also increment a metric -`consul.raft.logstore.verifier.write_checksum_failures` when this occurs. - -#### Handling Checksum Failures - -If either type of corruption is detected, the only safe way to handle it is to -follow the [revert to BoltDB procedure](#procedure-to-revert-to-boltdb). If the -server is already using BoltDB, the same is true although this is likely to -indicate a latent bug in BoltDB or a bug in our verification code that needs to -be investigated. - -Please report all verification failures via a [GitHub -issue](https://github.com/hashicorp/consul/issues/new?assignees=&labels=&template=bug_report.md&title=WAL:%20Checksum%20Failure). - -It would really help us if you can include: - - Details of your server cluster configuration and hardware - - Logs around the failure message - - Context for how long they have been running the configuration - - Any metrics or description of the workload you have, e.g. how many raft - commits per second as well as the performance metrics described below - -~> We recommend setting up an alert on Consul server logs containing -`verification checksum FAILED` or on the -`consul.raft.logstore.verifier.{read|write}_checksum_failures` metrics. The -sooner a corrupt server is handled, the lower the chance of any of the -[potential risks](#risks) causing problems in your cluster. - -### Performance Metrics - -The key performance metrics to watch are: - -* `consul.raft.commitTime` measures the time to commit new writes on a quorum of - servers. It should be the same or lower after deploying WAL. Even if WAL is - faster for your workload and hardware, it may not be reflected in commitTime - until enough followers are using it that the leader doesn't have to wait for a - slower one (one in a cluster of three, two in a cluster of five etc.). - -* `consul.raft.rpc.appendEntries.storeLogs` measures the time spent persisting - logs to disk on each _follower_. It should be the same or lower for - WAL-enabled followers. - -* `consul.raft.replication.appendEntries.rpc` measures the time taken for each - `AppendEntries` RPC from the leader's perspective. If this is significantly - higher than `consul.raft.rpc.appendEntries` on the follower, it indicates a - known queuing issue in our raft library that will be fixed soon and is - essentially unrelated to the backend. The follower(s) with WAL enabled should - not be slower than the others. You can work out which follower each metric is - for by matching the `peer_id` label value to the server IDs listed by `consul - operator raft list-peers`. - -* `consul.raft.compactLogs` measures the time take to truncate the logs after a - snapshot. WAL-enabled servers should not be slower than BoltDB ones. - -* `consul.raft.leader.dispatchLog` measures the time spent persisting logs to - disk on the _leader_. It is only relevant if a WAL-enabled server becomes a - leader. 
It should be the same or lower than before when the leader was using - BoltDB. \ No newline at end of file diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index c1ca8961c0c2..59b51be5666e 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -294,10 +294,6 @@ { "title": "Upgrading to Latest 1.10.x", "path": "upgrading/instructions/upgrade-to-1-10-x" - }, - { - "title": "Consul 1.15: Experimental WAL LogStore", - "path": "upgrading/instructions/testing-experimental-raft-backend" } ] } @@ -771,6 +767,27 @@ "title": "RPC", "path": "agent/rpc", "hidden": true + }, + { + "title": "Experimental WAL LogStore", + "routes": [ + { + "title": "Overview", + "path": "agent/wal-logstore" + }, + { + "title": "Enable WAL LogStore backend", + "path": "agent/wal-logstore/enable" + }, + { + "title": "Monitor Raft metrics and logs for WAL", + "path": "agent/wal-logstore/monitoring" + }, + { + "title": "Revert to BoltDB", + "path": "agent/wal-logstore/revert-to-boltdb" + } + ] } ] }, From b301bf947fb6aa4c6fcd3553bd8a81bf8bf2bef1 Mon Sep 17 00:00:00 2001 From: trujillo-adam Date: Thu, 23 Feb 2023 10:00:35 -0800 Subject: [PATCH 04/14] Updates to the WAL overview page --- .../content/docs/agent/wal-logstore/index.mdx | 48 +++++++------------ 1 file changed, 16 insertions(+), 32 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/index.mdx b/website/content/docs/agent/wal-logstore/index.mdx index c48144f8434f..f0e8cd3c8e5d 100644 --- a/website/content/docs/agent/wal-logstore/index.mdx +++ b/website/content/docs/agent/wal-logstore/index.mdx @@ -8,51 +8,35 @@ description: >- # Experimental WAL LogStore backend overview -This topic provides an overview of the experimental WAL (write-ahead log) LogStore backend. +This topic provides an overview of the experimental WAL (write-ahead log) LogStore backend. !> **Upgrade warning:** The WAL LogStore backend is experimental. -## Introduction +## WAL versus BoltDB -Consul ships with an experimental storage backend called write-ahead log (WAL). +WAL implements a traditional log with rotating, append-only log files. WAL resolves many issues with the existing `LogStore` provided by the BoltDB backend. The BoltDB `LogStore` is a copy-on-write BTree, which is not optimized for append-only workloads. -WAL implements a traditional log with rotating, append-only log files. The current `LogStore` uses BoltDB, which is a copy-on-write BTree, which is less optimized for append-only workloads. +The existing BoltDB log store inefficiently stores append-only logs to disk because it was designed as a full key-value database. It is a single file that only ever grows. Deleting the oldest logs, which Consul does regularly when it makes new snapshots of the state, leaves free space in the file. The free space must be tracked so that Consul can reuse it on future writes. By contrast, a simple segmented log can delete the oldest log files from disk. -### WAL versus BoldDB +When BoltDB is used as a log backend, a sudden burst of writes at a rate two to three times higher than the normal volume can suddenly cause the log file to grow to several times its steady-state size. After Consul takes the next snapshot and truncates the oldest logs, the resulting file is mostly empty space. To track the free space, Consul must write extra metadata to disk with every write. The metadata is proportional to the amount of free pages, so after a large burst write latencies tend to increase. 
In some cases, the latencies cause serious performance degradation to the cluster.

To mitigate risks associated with sudden bursts of log data, Consul prevents too many logs from accumulating in the LogStore. Significantly larger BoltDB files are generally slower to process because the file is structured as a tree, so the work required for each write grows with the size of the file. But the larger the file, the more likely it is to have a large freelist or suddenly form one after a burst of writes. For this reason, many of Consul's default options associated with snapshots, truncating logs, and keeping the log history have been aggressively set toward keeping BoltDB small rather than using disk IO more efficiently.

Other reliability issues, such as [raft replication capacity issues](/consul/docs/agent/telemetry#raft-replication-capacity-issues), are also related to balancing the size of the BoltDB log store and length of snapshots.

Although some users may not experience a significant difference in performance, the WAL backend avoids performance concerns associated with BoltDB. When directly measured, WAL is more performant than BoltDB because it solves a simpler storage problem than BoltDB was designed to solve. For example, WAL can commit a single log entry with one `fsync` instead of two, and tends to write two to three times fewer bytes to the disk for each commit. The primary benefit is that retaining more logs does not affect write performance.
As a result, strategies for reducing disk IO with slower snapshots, or for keeping more logs so that slower followers can catch up with cluster state, are all possible.

## WAL quality assurance

The WAL backend has been tested thoroughly during development:

- Every component in the WAL, from [metadata management](https://github.com/hashicorp/raft-wal/blob/main/types/meta.go) and [log file encoding](https://github.com/hashicorp/raft-wal/blob/main/types/segment.go) to the actual [file-system interaction](https://github.com/hashicorp/raft-wal/blob/main/types/vfs.go), is abstracted so unit tests can simulate difficult-to-reproduce disk failures.

- We used the [application-level intelligent crash explorer (ALICE)](https://github.com/hashicorp/raft-wal/blob/main/alice/README.md) to exhaustively simulate thousands of possible crash failure scenarios. WAL correctly recovered from all scenarios.

- We ran hundreds of tests in a performance testing cluster with checksum verification enabled and did not detect data loss or corruption. We will continue testing before making WAL the default backend.

We are aware of how complex and critical disk-persistence is for your data.
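If you want to evaluate the backend after reading this overview, the configuration change itself is small. The following sketch shows the shape of the settings covered in the configuration reference; the interval value is illustrative, and the full, safe rollout procedure is described in the enable guide.

```hcl
# Illustrative only: opt a single server into the experimental WAL backend
# and write/verify log checksums so any problem surfaces in logs and metrics.
raft_logstore {
  backend = "wal"

  verification {
    enabled  = true
    interval = "60s"
  }
}
```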
+We are aware of how complex and critical disk-persistence is for your data. -This guide describes how to safely try and verify it without risking the availability of your cluster should there be a latent data-loss issue discovered. \ No newline at end of file +Our goal is that many users at all degrees of scale try WAL in their environments after upgrading to 1.15 or later and report success or failure so that we can confidently replace BoltDB before as the default for new clusters. \ No newline at end of file From b4b92c828d2016ec2b323aa4c9dd16ef6b3b4fd9 Mon Sep 17 00:00:00 2001 From: trujillo-adam Date: Thu, 23 Feb 2023 11:05:28 -0800 Subject: [PATCH 05/14] updates to enable WAL usage topic --- .../docs/agent/wal-logstore/enable.mdx | 71 ++++++++++--------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx index c73feb31b75b..da935cfa0d87 100644 --- a/website/content/docs/agent/wal-logstore/enable.mdx +++ b/website/content/docs/agent/wal-logstore/enable.mdx @@ -23,43 +23,44 @@ The overall process for enabling the WAL LogStore backend for one server consist ## Requirements -- All servers in the Datacenter should be upgraded to Consul 1.15 using the [standard upgrade procedure](/consul/docs/upgrading/general-process) and the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x). -- You need a Consul cluster with at least 3 nodes to safely test the new backend without downtime. +- Consul 1.15 or later is required for all servers in the datacenter. Refer to the [standard upgrade procedure](/consul/docs/upgrading/general-process) and the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x) for additional information. +- A Consul cluster with at least three nodes are required to safely test the WAL backend without downtime. -In addition, we recommend: +We recommend taking the following additional measures: -- Taking a snapshot prior to testing in case things don't go to plan. -- Monitoring Consul server metrics and logs, and setting an alert on specific log events occurring. -- Enabling WAL in a pre-production environment and leave it running for a period of time (few days or weeks) before enabling it in production. +- Take a snapshot prior to testing. +- Monitor Consul server metrics and logs and sett an alert on specific log events that occur when WAL is enabled. +- Enable WAL in a pre-production environment and run it for a several days before enabling it in production. ## Risks -Although this document describes configuring and testing the WAL backend in a way that limits risk, there still are potential risks: +The likelihood of the following potential risks is low to very low: - - If WAL is enabled on a server and is found to corrupt data in some way. That server's data can't be recovered. The server will need to be restarted with an empty data directory and reload it's state from the leader. - - It's possible that WAL might corrupt data or otherwise have a bug that causes the server to panic and crash. It may even not be able to restart if the same bug occurs when it reads from the logs on startup. In this case as above the server would need to be restarted with WAL disabled and an empty data directory. - - It's _possible_ though unlikely that if WAL corrupted data, clients might read corrupted data from that server. For example invalid IP addresses or have tokens fail to match. 
This is unlikely even if there is a WAL corruption bug hit because replication typically takes place using in-memory cached objects rather than reads from disk. The issue would be fixed by restoring the server.
 - If you enable a server to use WAL using Consul OSS or on a voting server with Consul Enterprise, it's _possible_ that the WAL could cause corruption of that server's state (with the caveats above) _and then_ become the leader and replicate that corruption to all other servers. In this scenario only a restore from backup would recover a completely un-corrupt state.
- If you test on a non-voting server in Enterprise, this can't happen.
+ - If WAL corrupts data on a Consul server agent, the server data cannot be recovered. Restart the server with an empty data directory and reload its state from the leader to resolve the issue.
+ - WAL may corrupt data or contain a defect that causes the server to panic and crash. WAL may not restart if the defect recurs when WAL reads from the logs on startup. Restart the server with an empty data directory and reload its state from the leader to resolve the issue.
+ - Clients may read corrupted data from the Consul server, such as invalid IP addresses or tokens that fail to match, if WAL corrupts data. This is unlikely even if a recurring defect causes WAL to corrupt data because replication uses objects cached in memory rather than reads from disk. Restore the server to resolve the issue.
+ - If you enable a Consul OSS server to use WAL or enable WAL on a voting server with Consul Enterprise, WAL may corrupt the server's state, become the leader, and replicate the corrupted state to all other servers. In this scenario, only a restore from backup would recover a completely uncorrupted state. Test WAL on a non-voting server in Enterprise to prevent this scenario.

 ## Enable log verification

-You must enable log verification on any voting server in Enterprise and all servers in OSS since the leader writes verification checkpoints.
+You must enable log verification on any voting server in Enterprise and all servers in OSS because the leader writes verification checkpoints.

-On each voting server, add the following to the server's configuration file:
+1. On each voting server, add the following to the server's configuration file:

-```hcl
-raft_logstore {
-  verification {
-    enabled = true
-    interval = "60s"
+  ```hcl
+  raft_logstore {
+    verification {
+      enabled = true
+      interval = "60s"
+    }
  }
-}
-```
+  ```

-Restart each server to apply the changes (`consul reload` is not sufficient). Wait for each one to become a healthy voter again using `consul operator raft list-peers` before moving on to the next. This can take a few minutes if the snapshot is large.
+1. Restart each server to apply the changes. The `consul reload` is not sufficient to apply `raft_logstore` configuration changes.
+1. Run the `consul operator raft list-peers` command to wait for each server to become a healthy voter before moving on to the next. This may take a few minutes for large snapshots. See the example output below.
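The following output is for illustration only: the node names, IDs, and addresses are hypothetical, and the exact columns can vary between Consul versions. The signal to wait for is that the restarted server appears again with `Voter` set to `true` (non-voting Enterprise servers remain `false`) and that the cluster reports a leader.

```shell-session
$ consul operator raft list-peers
Node      ID                                    Address           State     Voter  RaftProtocol
server-1  11111111-2222-3333-4444-555555555555  10.0.10.11:8300   leader    true   3
server-2  22222222-3333-4444-5555-666666666666  10.0.10.12:8300   follower  true   3
server-3  33333333-4444-5555-6666-777777777777  10.0.10.13:8300   follower  true   3
```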
-Your log entries on the servers be similar to the following: +Whenc omplete, log entries for the servers should resemble the following status: ```log hideClipboard 2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c @@ -67,9 +68,9 @@ Your log entries on the servers be similar to the following: ## Select target server to enable WAL -If you are using Consul OSS or Consul Enterprise without non-voting servers, select one of the follower servers. As noted in [Risks](#risks), Consul Enterprise users with non-voting servers should select a non-voting server at first. +If you are using Consul OSS or Consul Enterprise without non-voting servers, select a follower server to enable WAL. As noted in [Risks](#risks), Consul Enterprise users with non-voting servers should first select a non-voting server. -Retrieve the current state of the servers by running the following: +Retrieve the current state of the servers by running the following command: ```shell-session $ consul operator raft list-peers @@ -84,21 +85,21 @@ run the following command: $ systemctl stop consul ``` -If you have any configuration management automation that might interfere with this process (for example, Chef or Puppet), you must disable them until you have completely enabled WAL as a storage backend. +If your environment uses configuration management automation that might interfere with this process, such as Chef or Puppet, you must disable them until you have completely enabled WAL as a storage backend. ## Remove data directory from target server -Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. However, once you restart the server, you should not use the old data directory (`/data-dir/raft.bak`) for recovery, and will eventually need to delete it. +Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory. -Move the data directory. Replace `/data-dir` the value you have specified in your configuration file. +The following example moves the data atfrom `/data-dir` in the configuration file to `/temp/data-dir`. ```shell-session -$ mv /data-dir/raft /data-dir/raft.bak +$ mv /data-dir/raft /temp/data-dir/raft.bak ``` -When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory since the log must always be consistent with the snapshots to avoid undefined behavior or data loss. +When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory. This is bedause the log must always be consistent with the snapshots to avoid undefined behavior or data loss. -## Update target server's configuration +## Update target server configuration Add the following to the target server's configuration file: @@ -128,16 +129,16 @@ $ consul operator raft list-peers ## Monitor target server Raft metrics and logs -See the section below on [monitoring WAL tests](/consul/docs/agent/wal-logstore/monitoring). +Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for details. 
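While the monitoring page covers metrics in depth, a quick way to spot-check the verification checkpoints during the test is to search the server's logs for the messages shown earlier. The sketch below is an assumption-laden example: it assumes a systemd-managed host whose unit is named `consul`; adjust the unit name or use your own log pipeline if your setup differs.

```shell-session
$ journalctl -u consul --since "1 hour ago" | grep "verification checksum"
```

Seeing only `verification checksum OK` entries is the expected healthy result. Treat any `FAILED` entry as described in the monitoring guide.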
-We recommend you leave the cluster in this configuration for days or weeks, assuming that you see no negative metrics or verification errors in logs to increase confidence in the WAL under varying workloads and during routine server restarts etc. +We recommend leaving the cluster in the test configuration for several days or weeks. If you do not record negative metrics or verification errors in logs, then you should have more confidence that WAL operates corerctly under varying workloads and during routine server restarts. -If you disabled Chef, Puppet or similar earlier, you may want to consider enabling it again while the test runs. Ensure that it will not "fix" the Consul configuration file and remove the different backend though. +If you disabled configuration management automation, consider reenabling it during the testing phase. Monitor the automation so that you can verify that it does not fix the Consul configuration file and remove the different backend. ## Next steps - If you see any verification errors, performance anomalies or other suspicious behavior from the target server during the test, you should follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). -- If you see no errors and would like to expand the test further, you can repeat the above procedure on another target server. We suggest waiting a while after each and slowly rolling out. Once the majority of your servers are using WAL any bugs not yet found could result in cluster unavailability. +- If you do not see errors and would like to expand the test further, you can repeat the above procedure on another target server. We suggest waiting after each test expansion and slowly rolling WAL out to other parts of your environment. Once the majority of your servers are using WAL, any bugs not yet discovered may result in cluster unavailability. -- If you wish to permanently enable `wal` on all servers, you'll need to follow the above steps on each one. Even if `backend = "wal"` is set in logs, servers will continue to use BoltDB if they find an existing raft.db file in the data dir. \ No newline at end of file +- If you wish to permanently enable `wal` on all servers, repeat the steps described in this topic for each server. Even if `backend = "wal"` is set in logs, servers continue to use BoltDB if they find an existing raft.db file in the data dir. \ No newline at end of file From 905bd70a7466539fed300326ba5648fb10ff9fa1 Mon Sep 17 00:00:00 2001 From: trujillo-adam Date: Thu, 23 Feb 2023 11:43:04 -0800 Subject: [PATCH 06/14] updates to the monitoring WAL backend topic --- .../docs/agent/wal-logstore/monitoring.mdx | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/monitoring.mdx b/website/content/docs/agent/wal-logstore/monitoring.mdx index 7d50fe4f2b8b..f1d670b01beb 100644 --- a/website/content/docs/agent/wal-logstore/monitoring.mdx +++ b/website/content/docs/agent/wal-logstore/monitoring.mdx @@ -8,41 +8,41 @@ description: >- # Monitor Raft metrics and logs for WAL -Throughout the testing period, it's important to monitor the cluster and especially the target server for signals that the WAL is not performing properly or is behaving incorrectly. +This topic describes how to monitor Raft metrics and logs if you are testing the WAL backend. We strongly recommend monitoring the Consul cluster, especially the target server, for evidence that the WAL backend is not functioning correctly. 
Refer to [Enable the experimental WAL LogStore backend](/consul/docs/agent/wal-logstore/index) for additional information about the WAL backend. !> **Upgrade warning:** The WAL LogStore backend is experimental. ## Monitor for checksum failures -If the log store verification fails on any server (whether it's running BoltDB or WAL backed), that is an **unrecoverable error**. It will look something like this in the logs: +Log store verification failures on any server, regardless of whether you are running the BoltDB or WAL backed, are unrecoverable error. Consul may report the following errors. -### Read Failures: Disk Corruption +### Read failures: Disk Corruption ```log hideClipboard 2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: storage corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... readChecksum=0x45... ``` -This indicates that the server read back different data to what it wrote to disk which signals corruption in the storage backend or filesystem. +This indicates that the server read back data that is different from what it wrote to disk, which signals corruption in the storage backend or filesystem. For convenience, we also increment a metric `consul.raft.logstore.verifier.read_checksum_failures` when this occurs. -### Write Failures: In-flight Corruption +### Write failures: In-flight Corruption -It's also possible that you might see a different kind of checksum error: +The following error indicates that the checksum on the follower did not match the leader when the leader _wrote_ the logs. The error implies that the corruption happened in the network or software and not the log store: ```log hideClipboard 2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: in-flight corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... followerWriteChecksum=0x45... ``` -This indicates that the checksum on the follower didn't match the leader when it _wrote_ the logs which implies that the corruption happened in the network or software and not the log store. This likely doesn't indicate an issue with the storage backend but should be handled the same way. +It is unlikely that this error indicates an issue with the storage backend, but you should take steps to resolve it. -For convenience, we also increment a metric `consul.raft.logstore.verifier.write_checksum_failures` when this occurs. +The `consul.raft.logstore.verifier.write_checksum_failures` increments when this error occurs. -### Handling Checksum Failures +## Resolve checksum failures -If either type of corruption is detected, the only safe way to handle it is to follow the [revert to BoltDB procedure](/consul/docs/agent/wal-logstore/revert-to-boltdb). If the server is already using BoltDB, the same is true although this is likely to indicate a latent bug in BoltDB or a bug in our verification code that needs to be investigated. +If either type of corruption is detected, complete the instruction for [reverting to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). If the server already uses BoltDB, the errors likely idicate a latent bug in BoltDB or a bug in the verification code, but you should still follow the revert instructions. -Please report all verification failures via a [GitHub +Report all verification failures as a [GitHub issue](https://github.com/hashicorp/consul/issues/new?assignees=&labels=&template=bug_report.md&title=WAL:%20Checksum%20Failure). 
In your report, include the following: @@ -54,15 +54,15 @@ In your report, include the following: We recommend setting up an alert on Consul server logs containing `verification checksum FAILED` or on the `consul.raft.logstore.verifier.{read|write}_checksum_failures` metrics. The sooner a corrupt server is handled, the lower the chance of any of the [potential risks](/consul/docs/agent/wal-logstore/enable#risks) causing problems in your cluster. -## Performance Metrics +## Performance metrics The key performance metrics to watch are: - `consul.raft.commitTime` measures the time to commit new writes on a quorum of servers. It should be the same or lower after deploying WAL. Even if WAL is - faster for your workload and hardware, it may not be reflected in commitTime - until enough followers are using it that the leader doesn't have to wait for a - slower one (one in a cluster of three, two in a cluster of five etc.). + faster for your workload and hardware, it may not be reflected in `commitTime` + until enough followers are using WAL so that the leader does not have to wait for + a slower follower a cluster of three or two slower followers in a cluster of five to catch up. - `consul.raft.rpc.appendEntries.storeLogs` measures the time spent persisting logs to disk on each _follower_. It should be the same or lower for @@ -71,14 +71,14 @@ The key performance metrics to watch are: - `consul.raft.replication.appendEntries.rpc` measures the time taken for each `AppendEntries` RPC from the leader's perspective. If this is significantly higher than `consul.raft.rpc.appendEntries` on the follower, it indicates a - known queuing issue in our raft library that will be fixed soon and is - essentially unrelated to the backend. The follower(s) with WAL enabled should - not be slower than the others. You can work out which follower each metric is - for by matching the `peer_id` label value to the server IDs listed by `consul - operator raft list-peers`. + known queuing issue in our Raft library that and is unrelated to the backend. + Followers with WAL enabled should not be slower than the others. You can + determine which follower is associated with which metric by running the + `consul operator raft list-peers` command and matching the + `peer_id` label value to the server IDs listed. - `consul.raft.compactLogs` measures the time take to truncate the logs after a - snapshot. WAL-enabled servers should not be slower than BoltDB ones. + snapshot. WAL-enabled servers should not be slower than BoltDB servers. - `consul.raft.leader.dispatchLog` measures the time spent persisting logs to disk on the _leader_. It is only relevant if a WAL-enabled server becomes a From ea0df5c86ca137f347935d46cc6ae6a1db55b4b7 Mon Sep 17 00:00:00 2001 From: trujillo-adam Date: Thu, 23 Feb 2023 12:03:36 -0800 Subject: [PATCH 07/14] updates for revert WAL topic --- .../agent/wal-logstore/revert-to-boltdb.mdx | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx index 3f95f4f7786a..c15f84c9dd83 100644 --- a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx +++ b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx @@ -9,7 +9,7 @@ description: >- This topic describes revert your Consul storage backend from the experimental WAL LogStorage backend to the default BoltDB. -The overall process for reverting to BoltDB consists of the following steps. 
You will need to repeat these steps for all Consul servers. Notice this is very similar to enabling WAL. +The overall process for reverting to BoltDB consists of the following steps. Repeat the steps for all Consul servers that you need to revrt. 1. Select target server. 1. Stop target server gracefully. @@ -19,9 +19,9 @@ The overall process for reverting to BoltDB consists of the following steps. You ## Select target server -If you are using Consul OSS or Consul Enterprise without non-voting servers, select one of the follower servers. As noted in [Risks](/consul/docs/agent/wal-logstore/enable#risks), Consul Enterprise users with non-voting servers should select a non-voting server at first. +If you are using Consul OSS or Consul Enterprise without non-voting servers, select one of the follower servers. As noted in [Risks](/consul/docs/agent/wal-logstore/enable#risks), Consul Enterprise users with non-voting servers should begin the procedure with a non-voting server. -Retrieve the current state of the servers by running the following: +Run the following command to retrieve the current state of the servers: ```shell-session $ consul operator raft list-peers @@ -36,21 +36,19 @@ run the following command: $ systemctl stop consul ``` -If you have any configuration management automation that might interfere with this process (for example, Chef or Puppet), you must disable them until you have completely enabled WAL -as a storage backend. - +If your environment uses configuration management automation that might interfere with this process, such as Chef or Puppet, you must disable them until you have completely revereted the storage backend. ## Remove data directory from target server -Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. However, once you restart the server, you should not use the old data directory (`/data-dir/raft.bak`) for recovery, and will eventually need to delete it. +Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory. -Move the data directory. Replace `/data-dir` the value you have specified in your configuration file. +The following example moves the data atfrom `/data-dir` in the configuration file to `/temp/data-dir`. ```shell-session -$ mv /data-dir/raft /data-dir/raft.bak +$ mv /data-dir/raft /temp/data-dir/raft.bak ``` -When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory since the log must always be consistent with the snapshots to avoid undefined behavior or data loss. +When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory. This is because the log must always be consistent with the snapshots to avoid undefined behavior or data loss. 
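As a sanity check before moving anything, you can list the raft directory to see what the backend has written. The listing below is illustrative only and assumes a server that has previously run with the WAL backend enabled; a server that has only ever used BoltDB typically shows just `raft.db` and `snapshots`.

```shell-session
$ ls /data-dir/raft
raft.db  snapshots  wal
```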
## Update target server's configuration From 22a4aad857cfb57bbe16f09516b2fada2d32fa46 Mon Sep 17 00:00:00 2001 From: trujillo-adam Date: Fri, 24 Feb 2023 10:29:21 -0800 Subject: [PATCH 08/14] a few tweaks to overview and udpated metadescriptions --- .../docs/agent/wal-logstore/enable.mdx | 18 ++++++++---------- .../content/docs/agent/wal-logstore/index.mdx | 19 ++++++++++++++----- .../docs/agent/wal-logstore/monitoring.mdx | 3 +-- .../agent/wal-logstore/revert-to-boltdb.mdx | 2 +- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx index da935cfa0d87..8e39052e50e6 100644 --- a/website/content/docs/agent/wal-logstore/enable.mdx +++ b/website/content/docs/agent/wal-logstore/enable.mdx @@ -7,17 +7,15 @@ description: >- # Enable the experimental WAL LogStore backend -This topic describes how to safely configure and test the WAL backend in your Consul deployment. - -The overall process for enabling the WAL LogStore backend for one server consists of the following steps. You will need to repeat these steps for all Consul servers. +This topic describes how to safely configure and test the WAL backend in your Consul deployment. The following steps describe the general process for enabling the WAL `LogStore` backend for a single server. Repeat the steps for each Consul server. 1. Enable log verification. 1. Select target server to enable WAL. 1. Stop target server gracefully. 1. Remove data directory from target server. 1. Update target server's configuration. -1. Start target server. -1. Monitor target server raft metrics and logs. +1. Start the target server. +1. Monitor target server raft metrics and logs. !> **Upgrade warning:** The WAL LogStore backend is experimental. @@ -44,7 +42,7 @@ The likelihood of the following potential risks is low to very low: ## Enable log verification -You must enable log verification on any voting server in Enterprise and all servers in OSS becaise the leader writes verification checkpoints. +You must enable log verification on any voting server in Enterprise and all servers in OSS because the leader writes verification checkpoints. 1. On each voting server, add the following to the server's configuration file: @@ -57,10 +55,10 @@ You must enable log verification on any voting server in Enterprise and all serv } ``` -1. Restart each server to apply the changes. The `consul reload` is not sufficient to apply `raft_logstore` configuration changes. +1. Restart the server to apply the changes. The `consul reload` is not sufficient to apply `raft_logstore` configuration changes. 1. Run the `consul operator raft list-peers` command to wait for each server to become a healthy voter before moving on to the next. This may take a few minutes for large snapshots. 
-Whenc omplete, log entries for the servers should resemble the following status: +When complete, log entries for the servers should resemble the following status: ```log hideClipboard 2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c @@ -139,6 +137,6 @@ If you disabled configuration management automation, consider reenabling it duri - If you see any verification errors, performance anomalies or other suspicious behavior from the target server during the test, you should follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). -- If you do not see errors and would like to expand the test further, you can repeat the above procedure on another target server. We suggest waiting after each test expansion and slowly rolling WAL out to other parts of your environment. Once the majority of your servers are using WAL, any bugs not yet discovered may result in cluster unavailability. +- If you do not see errors and would like to expand the test further, you can repeat the above procedure on another target server. We suggest waiting after each test expansion and slowly rolling WAL out to other parts of your environment. Once the majority of your servers use WAL, any bugs not yet discovered may result in cluster unavailability. -- If you wish to permanently enable `wal` on all servers, repeat the steps described in this topic for each server. Even if `backend = "wal"` is set in logs, servers continue to use BoltDB if they find an existing raft.db file in the data dir. \ No newline at end of file +- If you wish to permanently enable WAL on all servers, repeat the steps described in this topic for each server. Even if `backend = "wal"` is set in logs, servers continue to use BoltDB if they find an existing raft.db file in the data directory. \ No newline at end of file diff --git a/website/content/docs/agent/wal-logstore/index.mdx b/website/content/docs/agent/wal-logstore/index.mdx index f0e8cd3c8e5d..4e23d71de807 100644 --- a/website/content/docs/agent/wal-logstore/index.mdx +++ b/website/content/docs/agent/wal-logstore/index.mdx @@ -2,8 +2,7 @@ layout: docs page_title: WAL LogStore Backend Overview description: >- - Consul 1.15 introduced a new experimental storage backend option. Learn how to - configure and test it out in a safe way. + The experimental WAL (write-ahead log) LogStore backend shipped in Consul 1.15 is intended to replace the BoltDB backend, improving performance and log storage issues. --- # Experimental WAL LogStore backend overview @@ -16,16 +15,26 @@ This topic provides an overview of the experimental WAL (write-ahead log) LogSto WAL implements a traditional log with rotating, append-only log files. WAL resolves many issues with the existing `LogStore` provided by the BoltDB backend. The BoltDB `LogStore` is a copy-on-write BTree, which is not optimized for append-only workloads. +### BoltDB storage scalability issues + The existing BoltDB log store inefficiently stores append-only logs to disk because it was designed as a full key-value database. It is a single file that only ever grows. Deleting the oldest logs, which Consul does regularly when it makes new snapshots of the state, leaves free space in the file. The free space must be tracked so that Consul can reuse it on future writes. By contrast, a simple segmented log can delete the oldest log files from disk. 
-When BoltDB is used as a log backend, a sudden burst of writes at a rate two to three times higher than the normal volume can suddenly cause the log file to grow to several times its steady-state size. After Consul takes the next snapshot and truncates the oldest logs, the resulting file is mostly empty space. To track the free space, Consul must write extra metadata to disk with every write. The metadata is proportional to the amount of free pages, so after a large burst write latencies tend to increase. In some cases, the latencies cause serious performance degradation to the cluster.
+A burst of writes at double or triple the normal volume can suddenly cause the log file to grow to several times its steady-state size. After Consul takes the next snapshot and truncates the oldest logs, the resulting file is mostly empty space.
+
+To track the free space, Consul must write extra metadata to disk with every write. The metadata is proportional to the amount of free pages, so after a large burst write latencies tend to increase. In some cases, the latencies cause serious performance degradation to the cluster.
+
+To mitigate risks associated with sudden bursts of log data, Consul prevents too many logs from accumulating in the `LogStore`. Significantly larger BoltDB files are generally slower to process because the file is structured as a tree. As a result, Consul must iterate through the log for each write.

-To mitigate risks associated with sudden bursts of log data, Consul prevents too many logs from accumulating in the LogStore. Significantly larger BoltDB files are generally slower to process because the file is structured as a tree. As a result, Consul must iterate through the log for each write. But the larger the file, the more likely it is to have a large freelist or suddenly form one after a burst of writes. For this reason, the many of Consul's default options asssociated with snapshots, truncating logs, and keeping the log history have been aggressively set toward keeping BoltDT small rather than uisng disk IO more efficiently.
+But the larger the file, the more likely it is to have a large freelist or suddenly form one after a burst of writes. For this reason, many of Consul's default options associated with snapshots, truncating logs, and keeping the log history aggressively keep BoltDB small rather than using disk IO more efficiently.

Other reliability issues, such as [raft replication capacity issues](/consul/docs/agent/telemetry#raft-replication-capacity-issues), are also related to balancing the size of the BoltDB log store and length of snapshots.

### WAL approaches storage issues differently

Although some users may not experience a significant difference in performance, the WAL backend avoids performance concerns associated with BoltDB. When directly measured, WAL is more performant than BoltDB because it solves a simpler storage problem than BoltDB was designed to solve. For example, WAL can commit a single log entry with one `fsync` instead
of two and tends to write two to three fewer bytes to the disk to commit the `fsync`.

+The primary benefit is that retaining more logs does not affect write performance.
As a result, strategies for reducing disk IO with slower snapshots or for keeping logs to permit slower followers to catch up with cluster state are all possible. ## WAL quality assurance diff --git a/website/content/docs/agent/wal-logstore/monitoring.mdx b/website/content/docs/agent/wal-logstore/monitoring.mdx index f1d670b01beb..98a5d974d3e7 100644 --- a/website/content/docs/agent/wal-logstore/monitoring.mdx +++ b/website/content/docs/agent/wal-logstore/monitoring.mdx @@ -2,8 +2,7 @@ layout: docs page_title: Monitor Raft metrics and logs for WAL description: >- - Consul 1.15 introduced a new experimental storage backend option. Learn how to - configure and test it out in a safe way. + Learn how to monitor Raft metrics emitted the experimental WAL (write-ahead log) LogStore backend shipped in Consul 1.15. --- # Monitor Raft metrics and logs for WAL diff --git a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx index c15f84c9dd83..8d2ca5041bca 100644 --- a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx +++ b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx @@ -2,7 +2,7 @@ layout: docs page_title: Revert to BoltDB description: >- - Learn how to safely configure and test the experimental WAL backend in your Consul deployment. + Learn how to revert Consul to the BoltDB backend after enabled the WAL (write-ahead log) LogStore backend shipped in Consul 1.15. --- # Revert storage backend to BoltDB from WAL From c3e53c3b30e1938ded069be8111ecc78c4e277b6 Mon Sep 17 00:00:00 2001 From: Tu Nguyen Date: Fri, 24 Feb 2023 10:37:42 -0800 Subject: [PATCH 09/14] Apply suggestions from code review Co-authored-by: Paul Banks --- .../docs/agent/config/config-files.mdx | 5 ++--- .../docs/agent/wal-logstore/enable.mdx | 20 +++++++++---------- .../content/docs/agent/wal-logstore/index.mdx | 4 ++-- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index 9a2df3464a37..b38faa96b247 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -1650,14 +1650,13 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." configure the `interval` and set [`enabled`](#raft_logstore_verification_enabled) to `true` to correctly enable intervals. We recommend using an interval between `30s` and `5m`. The performance overhead is insignificant if the - interval is set to `5m` or less. We recommend setting an interval to - control how frequently the report logs appear for human observation. + interval is set to `5m` or less. - `boltdb` ((#raft_logstore_boltdb)) - Object that configures options for Raft's `boltdb` backend. It has no effect if the `backend` is not `boltdb`. - `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Set to - `true` to disable storing BoltDB freelist to disk within the + `true` to disable storing BoltDB's freelist to disk within the `raft.db` file. 
Disabling freelist syncs reduces the disk IO required for write operations, but could potentially increase start up time because Consul must scan the database to find free space diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx index 8e39052e50e6..f554fdeb5583 100644 --- a/website/content/docs/agent/wal-logstore/enable.mdx +++ b/website/content/docs/agent/wal-logstore/enable.mdx @@ -36,8 +36,8 @@ The likelihood of the following potential risks is low to very low: - If WAL corrupts data on a Consul server agent, the server data cannot be recovered. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. - WAL may corrupt data or contain a defect that causes the server to panic and crash. WAL may not restart if the defect is recurs when WAL reads from the logs on startup. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. - - Clients may read corrupted data from the Consul server, such as invalid IP addresses or unmatching tokens, if WAL corrupts data. This is unlikely even if a recuring defect cause WAL to corrupt data because replication uses objects cached in memory rather than reads from disk. Restore the server to resolve the issue. - - If you enable a Consul OSS server to use WAL or enable WAL on a voting server with Consul Enterprise, WAL may corrupt the server's state, become the leader, and replicate the corrupted state to all other servers. In this scenario only a restore from backup would recover a completely un-corrupt state. Test WAL on a non-voting server in Enterprise to preven this scenario. + - Clients may read corrupted data from the Consul server, such as invalid IP addresses or unmatching tokens, if WAL corrupts data. This is unlikely even if a recuring defect cause WAL to corrupt data because replication uses objects cached in memory rather than reads from disk. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. + - If you enable a Consul OSS server to use WAL or enable WAL on a voting server with Consul Enterprise, WAL may corrupt the server's state, become the leader, and replicate the corrupted state to all other servers. In this case only a restore from backup would recover a completely un-corrupt state. Test WAL on a non-voting server in Enterprise to prevent this. You can add a new non-voting server to the cluster to test with if there are no existing ones. ## Enable log verification @@ -66,7 +66,7 @@ When complete, log entries for the servers should resemble the following status: ## Select target server to enable WAL -If you are using Consul OSS or Consul Enterprise without non-voting servers, select a follower server to enable WAL. As noted in [Risks](#risks), Consul Enterprise users with non-voting servers should first select a non-voting server. +If you are using Consul OSS or Consul Enterprise without non-voting servers, select a follower server to enable WAL. As noted in [Risks](#risks), Consul Enterprise users with non-voting servers should first select a non-voting server, or consider adding another server as a non-voter to test on. Retrieve the current state of the servers by running the following command: @@ -76,7 +76,7 @@ $ consul operator raft list-peers ## Stop target server -Stop the target server gracefully. For example, if you are using `systemcmd`, +Stop the target server gracefully. 
For example, if you are using `systemd`, run the following command: ```shell-session @@ -89,10 +89,10 @@ If your environment uses configuration management automation that might interfer Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory. -The following example moves the data atfrom `/data-dir` in the configuration file to `/temp/data-dir`. +The following example assumes the `data_dir` in the server's configuration is `/data-dir` and renames it to `/data-dir.bak`. ```shell-session -$ mv /data-dir/raft /temp/data-dir/raft.bak +$ mv /data-dir/raft /data-dir/raft.bak ``` When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory. This is bedause the log must always be consistent with the snapshots to avoid undefined behavior or data loss. @@ -113,7 +113,7 @@ raft_logstore { ## Start target server -Start the target server. For example, if you are using `systemcmd`, run the following command: +Start the target server. For example, if you are using `systemd`, run the following command: ```shell-session $ systemctl start consul @@ -129,13 +129,13 @@ $ consul operator raft list-peers Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for details. -We recommend leaving the cluster in the test configuration for several days or weeks. If you do not record negative metrics or verification errors in logs, then you should have more confidence that WAL operates corerctly under varying workloads and during routine server restarts. +We recommend leaving the cluster in the test configuration for several days or weeks assuming no errors observed. An extended test provides more confidence that WAL operates correctly under varying workloads and during routine server restarts. If any errors are observed, you should end the test immediately and report them. -If you disabled configuration management automation, consider reenabling it during the testing phase. Monitor the automation so that you can verify that it does not fix the Consul configuration file and remove the different backend. +If you disabled configuration management automation, consider reenabling it during the testing phase to pick up other updates for the host. You must ensure that it does not revert the Consul configuration file and remove the different backend configuration. One way to do this is add the `raft_logstore` block to a separate file that is not managed by your automation. This file can either be added to the directory if [`-config-dir`](/consul/docs/agent/config/cli-flags#_config_dir) is used or as an additional [`-config-file`](/consul/docs/agent/config/cli-flags#_config_file) argument. ## Next steps -- If you see any verification errors, performance anomalies or other suspicious behavior from the target server during the test, you should follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). +- If you see any verification errors, performance anomalies or other suspicious behavior from the target server during the test, you should immediately follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). Please report failure via GitHub. 
- If you do not see errors and would like to expand the test further, you can repeat the above procedure on another target server. We suggest waiting after each test expansion and slowly rolling WAL out to other parts of your environment. Once the majority of your servers use WAL, any bugs not yet discovered may result in cluster unavailability. diff --git a/website/content/docs/agent/wal-logstore/index.mdx b/website/content/docs/agent/wal-logstore/index.mdx index 4e23d71de807..e4ad3c9774ca 100644 --- a/website/content/docs/agent/wal-logstore/index.mdx +++ b/website/content/docs/agent/wal-logstore/index.mdx @@ -13,11 +13,11 @@ This topic provides an overview of the experimental WAL (write-ahead log) LogSto ## WAL versus BoltDB -WAL implements a traditional log with rotating, append-only log files. WAL resolves many issues with the existing `LogStore` provided by the BoltDB backend. The BoltDB `LogStore` is a copy-on-write BTree, which is not optimized for append-only workloads. +WAL implements a traditional log with rotating, append-only log files. WAL resolves many issues with the existing `LogStore` provided by the BoltDB backend. The BoltDB `LogStore` is a copy-on-write BTree, which is not optimized for append-only, write-heavy workloads. ### BoltDB storage scalability issues -The existing BoltDB log store inefficiently stores append-only logs to disk because it was designed as a full key-value database. It is a single file that only ever grows. Deleting the oldest logs, which Consul does regularly when it makes new snapshots of the state, leaves free space in the file. The free space must be tracked so that Consul can reuse it on future writes. By contrast, a simple segmented log can delete the oldest log files from disk. +The existing BoltDB log store inefficiently stores append-only logs to disk because it was designed as a full key-value database. It is a single file that only ever grows. Deleting the oldest logs, which Consul does regularly when it makes new snapshots of the state, leaves free space in the file. The free space must be tracked in a `freelist` so that BoltDB can reuse it on future writes. By contrast, a simple segmented log can delete the oldest log files from disk. A burst of writes at double or triple the normal volume can suddenly cause the log file to grow to several times its steady-state size. After Consul takes the next snapshot and truncates the oldest logs, the resulting file is mostly empty space. From 5b7e1529889b16b15ff214f4a64371a2282ea831 Mon Sep 17 00:00:00 2001 From: Tu Nguyen Date: Fri, 24 Feb 2023 10:39:17 -0800 Subject: [PATCH 10/14] make revert docs consistent with enable --- .../content/docs/agent/wal-logstore/revert-to-boltdb.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx index 8d2ca5041bca..8419db1dd28a 100644 --- a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx +++ b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx @@ -29,7 +29,7 @@ $ consul operator raft list-peers ## Stop target server gracefully -Stop the target server gracefully. For example, if you are using `systemcmd`, +Stop the target server gracefully. 
For example, if you are using `systemd`, run the following command: ```shell-session @@ -42,10 +42,10 @@ If your environment uses configuration management automation that might interfer Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory. -The following example moves the data atfrom `/data-dir` in the configuration file to `/temp/data-dir`. +The following example assumes the `data_dir` in the server's configuration is `/data-dir` and renames it to `/data-dir.bak`. ```shell-session -$ mv /data-dir/raft /temp/data-dir/raft.bak +$ mv /data-dir/raft /data-dir/raft.bak ``` When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory. This is because the log must always be consistent with the snapshots to avoid undefined behavior or data loss. @@ -66,7 +66,7 @@ raft_logstore { ## Start target server -Start the target server. For example, if you are using `systemcmd`, run the following command: +Start the target server. For example, if you are using `systemd`, run the following command: ```shell-session $ systemctl start consul From 3614ab14028ebee6cbccef9c888c5a03f6b55b8b Mon Sep 17 00:00:00 2001 From: Tu Nguyen Date: Fri, 24 Feb 2023 13:44:46 -0800 Subject: [PATCH 11/14] Apply suggestions from code review Co-authored-by: Paul Banks --- website/content/docs/agent/wal-logstore/enable.mdx | 2 +- website/content/docs/agent/wal-logstore/index.mdx | 6 +++--- website/content/docs/agent/wal-logstore/monitoring.mdx | 10 +++++----- .../docs/agent/wal-logstore/revert-to-boltdb.mdx | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx index f554fdeb5583..fa0e6c0f6bce 100644 --- a/website/content/docs/agent/wal-logstore/enable.mdx +++ b/website/content/docs/agent/wal-logstore/enable.mdx @@ -27,7 +27,7 @@ This topic describes how to safely configure and test the WAL backend in your Co We recommend taking the following additional measures: - Take a snapshot prior to testing. -- Monitor Consul server metrics and logs and sett an alert on specific log events that occur when WAL is enabled. +- Monitor Consul server metrics and logs and set an alert on specific log events that occur when WAL is enabled. Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for more information. - Enable WAL in a pre-production environment and run it for a several days before enabling it in production. ## Risks diff --git a/website/content/docs/agent/wal-logstore/index.mdx b/website/content/docs/agent/wal-logstore/index.mdx index e4ad3c9774ca..410904492ecd 100644 --- a/website/content/docs/agent/wal-logstore/index.mdx +++ b/website/content/docs/agent/wal-logstore/index.mdx @@ -27,7 +27,7 @@ To mitigate risks associated with sudden bursts of log data, Consul prevents too But the larger the file, the more likely it is to have a large freelist or suddenly form one after a burst of writes. For this reason, the many of Consul's default options asssociated with snapshots, truncating logs, and keeping the log history aggressively keep BoltDT small rather than uisng disk IO more efficiently. 
-Other reliability issues, such as [raft replication capacity issues](/consul/docs/agent/telemetry#raft-replication-capacity-issues), are also related to balancing the size of the BoltDB log store and length of snapshots. +Other reliability issues, such as [raft replication capacity issues](/consul/docs/agent/telemetry#raft-replication-capacity-issues), are much simpler to solve without the performance concerns caused by storing more logs in BoltDB. ### WAL approaches storage issues differently @@ -40,7 +40,7 @@ The primary benefit is that retaining more logs does not affect write performanc The WAL backend has been tested thoroughly during development: -- Every component in the WAL, such as [metadata management](https://github.com/hashicorp/raft-wal/blob/main/types/meta.go), [log file encoding](https://github.com/hashicorp/raft-wal/blob/main/types/segment.go) to actual [file-system interaction](https://github.com/hashicorp/raft-wal/blob/main/types/vfs.go) were abstracted so unit tests can simulate difficult-to-reproduce disk failures. +- Every component in the WAL, such as [metadata management](https://github.com/hashicorp/raft-wal/blob/main/types/meta.go), [log file encoding](https://github.com/hashicorp/raft-wal/blob/main/types/segment.go) to actual [file-system interaction](https://github.com/hashicorp/raft-wal/blob/main/types/vfs.go) are abstracted so unit tests can simulate difficult-to-reproduce disk failures. - We used the [application-level intelligent crash explorer (ALICE)](https://github.com/hashicorp/raft-wal/blob/main/alice/README.md) to exhaustively simulate thousands of possible crash failure scenarios. WAL correctly recovered from all scenarios. @@ -48,4 +48,4 @@ The WAL backend has been tested thoroughly during development: We are aware of how complex and critical disk-persistence is for your data. -Our goal is that many users at all degrees of scale try WAL in their environments after upgrading to 1.15 or later and report success or failure so that we can confidently replace BoltDB before as the default for new clusters. \ No newline at end of file +We hope that many users at different scales will try WAL in their environments after upgrading to 1.15 or later and report success or failure so that we can confidently replace BoltDB as the default for new clusters in a future release. \ No newline at end of file diff --git a/website/content/docs/agent/wal-logstore/monitoring.mdx b/website/content/docs/agent/wal-logstore/monitoring.mdx index 98a5d974d3e7..ea63cf4c69a0 100644 --- a/website/content/docs/agent/wal-logstore/monitoring.mdx +++ b/website/content/docs/agent/wal-logstore/monitoring.mdx @@ -13,7 +13,7 @@ This topic describes how to monitor Raft metrics and logs if you are testing the ## Monitor for checksum failures -Log store verification failures on any server, regardless of whether you are running the BoltDB or WAL backed, are unrecoverable error. Consul may report the following errors. +Log store verification failures on any server, regardless of whether you are running the BoltDB or WAL backed, are unrecoverable errors. Consul may report the following errors in logs. ### Read failures: Disk Corruption @@ -21,25 +21,25 @@ Log store verification failures on any server, regardless of whether you are run 2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: storage corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... readChecksum=0x45... 
``` -This indicates that the server read back data that is different from what it wrote to disk, which signals corruption in the storage backend or filesystem. +This indicates that the server read back data that is different from what it wrote to disk. This indicates corruption in the storage backend or filesystem. For convenience, we also increment a metric `consul.raft.logstore.verifier.read_checksum_failures` when this occurs. ### Write failures: In-flight Corruption -The following error indicates that the checksum on the follower did not match the leader when the leader _wrote_ the logs. The error implies that the corruption happened in the network or software and not the log store: +The following error indicates that the checksum on the follower did not match the leader when the follower received the logs. The error implies that the corruption happened in the network or software and not the log store: ```log hideClipboard 2022-11-15T22:41:23.546Z [ERROR] agent.raft.logstore: verification checksum FAILED: in-flight corruption rangeStart=1234 rangeEnd=3456 leaderChecksum=0xc1... followerWriteChecksum=0x45... ``` -It is unlikely that this error indicates an issue with the storage backend, but you should take steps to resolve it. +It is unlikely that this error indicates an issue with the storage backend, but you should take the same steps to resolve and report it. The `consul.raft.logstore.verifier.write_checksum_failures` increments when this error occurs. ## Resolve checksum failures -If either type of corruption is detected, complete the instruction for [reverting to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). If the server already uses BoltDB, the errors likely idicate a latent bug in BoltDB or a bug in the verification code, but you should still follow the revert instructions. +If either type of corruption is detected, complete the instructions for [reverting to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). If the server already uses BoltDB, the errors likely indicate a latent bug in BoltDB or a bug in the verification code, but you should still follow the revert instructions. Report all verification failures as a [GitHub issue](https://github.com/hashicorp/consul/issues/new?assignees=&labels=&template=bug_report.md&title=WAL:%20Checksum%20Failure). diff --git a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx index 8419db1dd28a..c8bc8b1baca2 100644 --- a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx +++ b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx @@ -7,9 +7,9 @@ description: >- # Revert storage backend to BoltDB from WAL -This topic describes revert your Consul storage backend from the experimental WAL LogStorage backend to the default BoltDB. +This topic describes how to revert your Consul storage backend from the experimental WAL LogStore backend to the default BoltDB. -The overall process for reverting to BoltDB consists of the following steps. Repeat the steps for all Consul servers that you need to revrt. +The overall process for reverting to BoltDB consists of the following steps. Repeat the steps for all Consul servers that you need to revert. 1. Select target server. 1. Stop target server gracefully. @@ -80,7 +80,7 @@ $ consul operator raft list-peers ### Clean up old data directories -If necessary, cleanup any `raft.bak` directories. Replace `/data-dir` the value you have specified in your configuration file. 
+If necessary, cleanup any `raft.wal.bak` directories. Replace `/data-dir` with the value you have specified in your configuration file. ```shell-session $ rm /data-dir/raft.bak From e98acef02c6483e920b78489fbbaa27adb3bca40 Mon Sep 17 00:00:00 2001 From: Tu Nguyen Date: Fri, 24 Feb 2023 13:51:38 -0800 Subject: [PATCH 12/14] address feedback --- .../content/docs/agent/wal-logstore/enable.mdx | 6 ++++-- website/content/docs/agent/wal-logstore/index.mdx | 9 +++------ .../docs/agent/wal-logstore/revert-to-boltdb.mdx | 15 ++------------- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx index fa0e6c0f6bce..94f9e2d4827f 100644 --- a/website/content/docs/agent/wal-logstore/enable.mdx +++ b/website/content/docs/agent/wal-logstore/enable.mdx @@ -7,7 +7,9 @@ description: >- # Enable the experimental WAL LogStore backend -This topic describes how to safely configure and test the WAL backend in your Consul deployment. The following steps describe the general process for enabling the WAL `LogStore` backend for a single server. Repeat the steps for each Consul server. +This topic describes how to safely configure and test the WAL backend in your Consul deployment. + +The overall process for enabling the WAL LogStore backend for one server consists of the following steps. We recommend only enabling on a single server at first in production environments. If you eventually choose to expand the test to further servers, you will need to repeat these steps for each one. 1. Enable log verification. 1. Select target server to enable WAL. @@ -17,7 +19,7 @@ This topic describes how to safely configure and test the WAL backend in your Co 1. Start the target server. 1. Monitor target server raft metrics and logs. -!> **Upgrade warning:** The WAL LogStore backend is experimental. +!> **Experimental feature:** The WAL LogStore backend is experimental. ## Requirements diff --git a/website/content/docs/agent/wal-logstore/index.mdx b/website/content/docs/agent/wal-logstore/index.mdx index 410904492ecd..914d6602a9e9 100644 --- a/website/content/docs/agent/wal-logstore/index.mdx +++ b/website/content/docs/agent/wal-logstore/index.mdx @@ -9,7 +9,7 @@ description: >- This topic provides an overview of the experimental WAL (write-ahead log) LogStore backend. -!> **Upgrade warning:** The WAL LogStore backend is experimental. +!> **Experimental feature:** The WAL LogStore backend is experimental. ## WAL versus BoltDB @@ -23,7 +23,7 @@ A burst of writes at double or triple the normal volume can suddenly cause the l To track the free space, Consul must write extra metadata to disk with every write. The metadata is proportional to the amount of free pages, so after a large burst write latencies tend to increase. In some cases, the latencies cause serious performance degradation to the cluster. -To mitigate risks associated with sudden bursts of log data, Consul prevents too many logs from accumulating in the `LogStore`. Significantly larger BoltDB files are generally slower to process because the file is structured as a tree. As a result, Consul must iterate through the log for each write. +To mitigate risks associated with sudden bursts of log data, Consul tries to limit lots of logs from accumulating in the LogStore. Significantly larger BoltDB files are slower to append to because the tree is deeper and freelist larger. 
The larger the file, the more likely it is to have a large freelist or to suddenly form one after a burst of writes. For this reason, Consul's default options associated with snapshots, truncating logs, and keeping the log history have been aggressively set toward keeping BoltDB small rather than using disk IO optimally.
 
@@ -31,10 +31,7 @@ Other reliability issues, such as [raft replication capacity issues](/consul/doc
 
 ### WAL approaches storage issues differently
 
-Although some users may not experience a significant difference in performance, the WAL backend avoids performance concerns associated with BoltDB. When directly measured, WAL is more performant than BoltDB because it solves a simpler storage problem than BoltDB was designed to solve. For example, WAL can commit a single log entry with one `fsync` instead
-of two and tends to write two to three fewer bytes to the disk to commit the `fsync`.
-
-The primary benefit is that retaining more logs does not affect write performance. As a result, strategies for reducing disk IO with slower snapshots or for keeping logs to permit slower followers to catch up with cluster state are all possible.
+When directly measured, WAL is more performant than BoltDB because it solves a simpler storage problem. Despite this, some users may not notice a significant performance improvement from the upgrade with the same configuration and workload. In this case, the benefit of WAL is that retaining more logs does not affect write performance. As a result, strategies for reducing disk IO with slower snapshots or for keeping logs to permit slower followers to catch up with cluster state are all possible, increasing the reliability of the deployment.
 
 ## WAL quality assurance
 
diff --git a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx
index c8bc8b1baca2..6760ca2a92f1 100644
--- a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx
+++ b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx
@@ -11,22 +11,11 @@ This topic describes how to revert your Consul storage backend from the experime
 
 The overall process for reverting to BoltDB consists of the following steps. Repeat the steps for all Consul servers that you need to revert.
 
-1. Select target server.
 1. Stop target server gracefully.
 1. Remove data directory from target server.
 1. Update target server's configuration.
 1. Start target server.
 
-## Select target server
-
-If you are using Consul OSS or Consul Enterprise without non-voting servers, select one of the follower servers. As noted in [Risks](/consul/docs/agent/wal-logstore/enable#risks), Consul Enterprise users with non-voting servers should begin the procedure with a non-voting server.
-
-Run the following command to retrieve the current state of the servers:
-
-```shell-session
-$ consul operator raft list-peers
-```
-
 ## Stop target server gracefully
 
 Stop the target server gracefully. For example, if you are using `systemd`,
@@ -42,10 +31,10 @@ If your environment uses configuration management automation that might interfer
 
 Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL.
Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory.
 
-The following example assumes the `data_dir` in the server's configuration is `/data-dir` and renames it to `/data-dir.bak`.
+The following example assumes the `data_dir` in the server's configuration is `/data-dir` and renames the raft directory to `/data-dir/raft.wal.bak`.
 
 ```shell-session
-$ mv /data-dir/raft /data-dir/raft.bak
+$ mv /data-dir/raft /data-dir/raft.wal.bak
 ```
 
 When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory. This is because the log must always be consistent with the snapshots to avoid undefined behavior or data loss.
 
From ba80be346d634abf9ff343869625294af66d955b Mon Sep 17 00:00:00 2001
From: Tu Nguyen
Date: Fri, 24 Feb 2023 13:52:50 -0800
Subject: [PATCH 13/14] address final feedback

---
 website/content/docs/agent/wal-logstore/enable.mdx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx
index 94f9e2d4827f..21d6dde41115 100644
--- a/website/content/docs/agent/wal-logstore/enable.mdx
+++ b/website/content/docs/agent/wal-logstore/enable.mdx
@@ -44,7 +44,7 @@ The likelihood of the following potential risks is low to very low:
 
 ## Enable log verification
 
-You must enable log verification on any voting server in Enterprise and all servers in OSS because the leader writes verification checkpoints.
+You must enable log verification on all voting servers in Enterprise and all servers in OSS because the leader writes verification checkpoints.
 
 1. On each voting server, add the following to the server's configuration file:
 
@@ -57,10 +57,10 @@ You must enable log verification on any voting server in Enterprise and all serv
 }
 ```
 
-1. Restart the server to apply the changes. The `consul reload` is not sufficient to apply `raft_logstore` configuration changes.
+1. Restart the server to apply the changes. The `consul reload` command is not sufficient to apply `raft_logstore` configuration changes.
 1. Run the `consul operator raft list-peers` command to wait for each server to become a healthy voter before moving on to the next. This may take a few minutes for large snapshots.
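
Taken together, the settings referenced in these steps form a small `raft_logstore` block on each voting server. The following is a minimal sketch only; the `60s` interval is an illustrative value chosen from the recommended `30s` to `5m` range, not a prescribed default:

```hcl
# Minimal sketch of the log verification settings described in the steps above.
# Assumes this block sits alongside your existing server configuration; the 60s
# interval is an example value, not a recommendation specific to your workload.
raft_logstore {
  verification {
    enabled  = true
    interval = "60s"
  }
}
```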
-When complete, log entries for the servers should resemble the following status: +When complete, the server's logs should contain verifier reports like these: ```log hideClipboard 2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c From ad8dab00a92b0b93dc5c0f3c0d302ea252514227 Mon Sep 17 00:00:00 2001 From: Tu Nguyen Date: Sun, 26 Feb 2023 19:20:14 -0800 Subject: [PATCH 14/14] Apply suggestions from code review Co-authored-by: Jeff Boruszak <104028618+boruszak@users.noreply.github.com> --- .../docs/agent/config/config-files.mdx | 48 ++++++++----------- .../docs/agent/wal-logstore/enable.mdx | 27 +++++------ .../docs/agent/wal-logstore/monitoring.mdx | 18 +++---- .../agent/wal-logstore/revert-to-boltdb.mdx | 6 +-- 4 files changed, 46 insertions(+), 53 deletions(-) diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index b38faa96b247..2992f562af00 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -1586,22 +1586,22 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." ## Raft Parameters -- `raft_boltdb` ((#raft_boltdb)) **These fields are deprecated in Consul 1.15.0. +- `raft_boltdb` ((#raft_boltdb)) **These fields are deprecated in Consul v1.15.0. Use [`raft_logstore`](#raft_logstore) instead.** This is a nested - object that allows configuring options for Raft's BoltDB based log store. + object that allows configuring options for Raft's BoltDB-based log store. - - `NoFreelistSync` **This field is deprecated in Consul 1.15.0. Use the + - `NoFreelistSync` **This field is deprecated in Consul v1.15.0. Use the [`raft_logstore.boltdb.no_freelist_sync`](#raft_logstore_boltdb_no_freelist_sync) field - instead.** Setting this to `true` will disable syncing the BoltDB freelist - to disk within the raft.db file. Not syncing the freelist to disk will - reduce disk IO required for write operations at the expense of potentially + instead.** Setting this to `true` disables syncing the BoltDB freelist + to disk within the raft.db file. Not syncing the freelist to disk + reduces disk IO required for write operations at the expense of potentially increasing start up time due to needing to scan the db to discover where the free space resides within the file. - `raft_logstore` ((#raft_logstore)) This is a nested object that allows configuring options for Raft's LogStore component which is used to persist logs and crucial Raft state on disk during writes. This was added in Consul - 1.15. + v1.15.0. - `backend` ((#raft_logstore_backend)) Specifies which storage engine to use to persist logs. Valid options are `boltdb` or `wal`. Default @@ -1610,27 +1610,21 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) for more information. - - `disable_log_cache` ((#raft_logstore_disable_log_cache)) This allows - disabling of the in-memory cache of recent logs. This exists mostly for - performance testing purposes. In theory the log cache prevents disk reads - for recent logs. In practice recent logs are still in OS page cache so tend - not to be slow to read using either backend. We recommend leaving it enabled - for now as we've not measured a significant improvement in any metric by - disabling. 
+ - `disable_log_cache` ((#raft_logstore_disable_log_cache)) Disables the in-memory cache for recent logs. We recommend using it for performance testing purposes, as no significant improvement has been measured when the cache is disabled. While the in-memory log cache theoretically prevents disk reads for recent logs, recent logs are also stored in the OS page cache, which does not slow either the `boltdb` or `wal` backend's ability to read them. - `verification` ((#raft_logstore_verification)) This is a nested object that - allows configuring online verification of the LogStore. Verification + allows configuring the online verification of the LogStore. Verification provides additional assurances that LogStore backends are correctly storing - data. It imposes very low overhead on servers and is safe to run in - production, however it's mostly useful when evaluating a new backend + data. It imposes low overhead on servers and is safe to run in + production. It is most useful when evaluating a new backend implementation. Verification must be enabled on the leader to have any effect and can be - used with any backend. When enabled, the leader will periodically write a - special "checkpoint" log message including checksums of all log entries + used with any backend. When enabled, the leader periodically writes a + special "checkpoint" log message that includes the checksums of all log entries written to Raft since the last checkpoint. Followers that have verification - enabled will run a background task for each checkpoint that reads all logs - directly from the LogStore and recomputes the checksum. A report is output + enabled run a background task for each checkpoint that reads all logs + directly from the LogStore and then recomputes the checksum. A report is output as an INFO level log for each checkpoint. Checksum failure should never happen and indicate unrecoverable corruption @@ -1643,13 +1637,13 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." - `enabled` ((#raft_logstore_verification_enabled)) - Set to `true` to allow this Consul server to write and verify log verification checkpoints - when it is elected leader. + when elected leader. - `interval` ((#raft_logstore_verification_interval)) - Specifies the time interval between checkpoints. There is no default value. You must configure the `interval` and set [`enabled`](#raft_logstore_verification_enabled) to `true` to correctly enable intervals. We recommend using an interval - between `30s` and `5m`. The performance overhead is insignificant if the + between `30s` and `5m`. The performance overhead is insignificant when the interval is set to `5m` or less. - `boltdb` ((#raft_logstore_boltdb)) - Object that configures options for @@ -1668,11 +1662,11 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'." - `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - Integer value that represents the target size in MB for each segment file before - rolling to a new segment. The default is `64` and is suitable for - most deployments. A smaller value may use less disk space because you - can reclaim space by deleting old segments sooner, but a smaller segment + rolling to a new segment. The default value is `64` and is suitable for + most deployments. While a smaller value may use less disk space because you + can reclaim space by deleting old segments sooner, the smaller segment that results may affect performance because safely rotating to a new file more - frequently could impact tail latencies. 
Larger values are unlikely + frequently can impact tail latencies. Larger values are unlikely to improve performance significantly. We recommend using this configuration for performance testing purposes. diff --git a/website/content/docs/agent/wal-logstore/enable.mdx b/website/content/docs/agent/wal-logstore/enable.mdx index 21d6dde41115..a4a89f70c86e 100644 --- a/website/content/docs/agent/wal-logstore/enable.mdx +++ b/website/content/docs/agent/wal-logstore/enable.mdx @@ -9,7 +9,7 @@ description: >- This topic describes how to safely configure and test the WAL backend in your Consul deployment. -The overall process for enabling the WAL LogStore backend for one server consists of the following steps. We recommend only enabling on a single server at first in production environments. If you eventually choose to expand the test to further servers, you will need to repeat these steps for each one. +The overall process for enabling the WAL LogStore backend for one server consists of the following steps. In production environments, we recommend starting by enabling the backend on a single server . If you eventually choose to expand the test to further servers, you must repeat these steps for each one. 1. Enable log verification. 1. Select target server to enable WAL. @@ -23,24 +23,23 @@ The overall process for enabling the WAL LogStore backend for one server consist ## Requirements -- Consul 1.15 or later is required for all servers in the datacenter. Refer to the [standard upgrade procedure](/consul/docs/upgrading/general-process) and the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x) for additional information. +- Consul v1.15 or later is required for all servers in the datacenter. Refer to the [standard upgrade procedure](/consul/docs/upgrading/general-process) and the [1.15 upgrade notes](/consul/docs/upgrading/upgrade-specific#consul-1-15-x) for additional information. - A Consul cluster with at least three nodes are required to safely test the WAL backend without downtime. We recommend taking the following additional measures: - Take a snapshot prior to testing. -- Monitor Consul server metrics and logs and set an alert on specific log events that occur when WAL is enabled. Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for more information. +- Monitor Consul server metrics and logs, and set an alert on specific log events that occur when WAL is enabled. Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for more information. - Enable WAL in a pre-production environment and run it for a several days before enabling it in production. ## Risks -The likelihood of the following potential risks is low to very low: +While their likelihood remains low to very low, be aware of the following risks before implementing the WAL backend: - If WAL corrupts data on a Consul server agent, the server data cannot be recovered. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. - - WAL may corrupt data or contain a defect that causes the server to panic and crash. WAL may not restart if the defect is recurs when WAL reads from the logs on startup. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. - - Clients may read corrupted data from the Consul server, such as invalid IP addresses or unmatching tokens, if WAL corrupts data. 
This is unlikely even if a recuring defect cause WAL to corrupt data because replication uses objects cached in memory rather than reads from disk. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. - - If you enable a Consul OSS server to use WAL or enable WAL on a voting server with Consul Enterprise, WAL may corrupt the server's state, become the leader, and replicate the corrupted state to all other servers. In this case only a restore from backup would recover a completely un-corrupt state. Test WAL on a non-voting server in Enterprise to prevent this. You can add a new non-voting server to the cluster to test with if there are no existing ones. - + - WAL may corrupt data or contain a defect that causes the server to panic and crash. WAL may not restart if the defect recurs when WAL reads from the logs on startup. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. + - If WAL corrupts data, clients may read corrupted data from the Consul server, such as invalid IP addresses or unmatched tokens. This outcome is unlikely even if a recurring defect causes WAL to corrupt data because replication uses objects cached in memory instead of reads from disk. Restart the server with an empty data directory and reload its state from the leader to resolve the issue. + - If you enable a Consul OSS server to use WAL or enable WAL on a voting server with Consul Enterprise, WAL may corrupt the server's state, become the leader, and replicate the corrupted state to all other servers. In this case, restoring from backup is required to recover a completely uncorrupted state. Test WAL on a non-voting server in Enterprise to prevent this outcome. You can add a new non-voting server to the cluster to test with if there are no existing ones. ## Enable log verification @@ -60,7 +59,7 @@ You must enable log verification on all voting servers in Enterprise and all ser 1. Restart the server to apply the changes. The `consul reload` command is not sufficient to apply `raft_logstore` configuration changes. 1. Run the `consul operator raft list-peers` command to wait for each server to become a healthy voter before moving on to the next. This may take a few minutes for large snapshots. -When complete, the server's logs should contain verifier reports like these: +When complete, the server's logs should contain verifier reports that appear like the following example: ```log hideClipboard 2023-01-31T14:44:31.174Z [INFO] agent.server.raft.logstore.verifier: verification checksum OK: elapsed=488.463268ms leaderChecksum=f15db83976f2328c rangeEnd=357802 rangeStart=298132 readChecksum=f15db83976f2328c @@ -89,7 +88,7 @@ If your environment uses configuration management automation that might interfer ## Remove data directory from target server -Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory. +Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend moving it in cases where you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory. 
The following example assumes the `data_dir` in the server's configuration is `/data-dir` and renames the raft directory to `/data-dir/raft.bak`.
 
@@ -97,7 +96,7 @@ The following example assumes the `data_dir` in the server's configuration is `/
 $ mv /data-dir/raft /data-dir/raft.bak
 ```
 
-When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory. This is bedause the log must always be consistent with the snapshots to avoid undefined behavior or data loss.
+When switching backends, you must always remove _the entire raft directory_, not just the `raft.db` file or `wal` directory. The log must always be consistent with the snapshots to avoid undefined behavior or data loss.
 
 ## Update target server configuration
 
@@ -131,13 +130,13 @@ $ consul operator raft list-peers
 
 Refer to [Monitor Raft metrics and logs for WAL](/consul/docs/agent/wal-logstore/monitoring) for details.
 
-We recommend leaving the cluster in the test configuration for several days or weeks assuming no errors observed. An extended test provides more confidence that WAL operates correctly under varying workloads and during routine server restarts. If any errors are observed, you should end the test immediately and report them.
+We recommend leaving the cluster in the test configuration for several days or weeks, as long as you observe no errors. An extended test provides more confidence that WAL operates correctly under varied workloads and during routine server restarts. If you observe any errors, end the test immediately and report them.
 
-If you disabled configuration management automation, consider reenabling it during the testing phase to pick up other updates for the host. You must ensure that it does not revert the Consul configuration file and remove the different backend configuration. One way to do this is add the `raft_logstore` block to a separate file that is not managed by your automation. This file can either be added to the directory if [`-config-dir`](/consul/docs/agent/config/cli-flags#_config_dir) is used or as an additional [`-config-file`](/consul/docs/agent/config/cli-flags#_config_file) argument.
+If you disabled configuration management automation, consider reenabling it during the testing phase to pick up other updates for the host. You must ensure that it does not revert the Consul configuration file and remove the altered backend configuration. One way to do this is to add the `raft_logstore` block to a separate file that is not managed by your automation. This file can either be added to the directory if [`-config-dir`](/consul/docs/agent/config/cli-flags#_config_dir) is used or as an additional [`-config-file`](/consul/docs/agent/config/cli-flags#_config_file) argument.
 
 ## Next steps
 
-If you see any verification errors, performance anomalies or other suspicious behavior from the target server during the test, you should immediately follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). Please report failure via GitHub.
+If you observe any verification errors, performance anomalies, or other suspicious behavior from the target server during the test, you should immediately follow [the procedure to revert back to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). Report failures through GitHub.
 
-If you do not see errors and would like to expand the test further, you can repeat the above procedure on another target server.
We suggest waiting after each test expansion and slowly rolling WAL out to other parts of your environment. Once the majority of your servers use WAL, any bugs not yet discovered may result in cluster unavailability. diff --git a/website/content/docs/agent/wal-logstore/monitoring.mdx b/website/content/docs/agent/wal-logstore/monitoring.mdx index ea63cf4c69a0..5be765cf408b 100644 --- a/website/content/docs/agent/wal-logstore/monitoring.mdx +++ b/website/content/docs/agent/wal-logstore/monitoring.mdx @@ -23,7 +23,7 @@ Log store verification failures on any server, regardless of whether you are run This indicates that the server read back data that is different from what it wrote to disk. This indicates corruption in the storage backend or filesystem. -For convenience, we also increment a metric `consul.raft.logstore.verifier.read_checksum_failures` when this occurs. +For convenience, Consul also increments a metric `consul.raft.logstore.verifier.read_checksum_failures` when this occurs. ### Write failures: In-flight Corruption @@ -35,11 +35,11 @@ The following error indicates that the checksum on the follower did not match th It is unlikely that this error indicates an issue with the storage backend, but you should take the same steps to resolve and report it. -The `consul.raft.logstore.verifier.write_checksum_failures` increments when this error occurs. +The `consul.raft.logstore.verifier.write_checksum_failures` metric increments when this error occurs. ## Resolve checksum failures -If either type of corruption is detected, complete the instructions for [reverting to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). If the server already uses BoltDB, the errors likely indicate a latent bug in BoltDB or a bug in the verification code, but you should still follow the revert instructions. +If either type of corruption is detected, complete the instructions for [reverting to BoltDB](/consul/docs/agent/wal-logstore/revert-to-boltdb). If the server already uses BoltDB, the errors likely indicate a latent bug in BoltDB or a bug in the verification code. In both cases, you should follow the revert instructions. Report all verification failures as a [GitHub issue](https://github.com/hashicorp/consul/issues/new?assignees=&labels=&template=bug_report.md&title=WAL:%20Checksum%20Failure). @@ -48,10 +48,10 @@ In your report, include the following: - Details of your server cluster configuration and hardware - Logs around the failure message - Context for how long they have been running the configuration - - Any metrics or description of the workload you have, e.g. how many raft - commits per second as well as the performance metrics described below + - Any metrics or description of the workload you have. For example, how many raft + commits per second. Also include the performance metrics described on this page. -We recommend setting up an alert on Consul server logs containing `verification checksum FAILED` or on the `consul.raft.logstore.verifier.{read|write}_checksum_failures` metrics. The sooner a corrupt server is handled, the lower the chance of any of the [potential risks](/consul/docs/agent/wal-logstore/enable#risks) causing problems in your cluster. +We recommend setting up an alert on Consul server logs containing `verification checksum FAILED` or on the `consul.raft.logstore.verifier.{read|write}_checksum_failures` metrics. 
The sooner you respond to a corrupt server, the lower the chance of any of the [potential risks](/consul/docs/agent/wal-logstore/enable#risks) causing problems in your cluster.
 
 ## Performance metrics
 
@@ -60,8 +60,8 @@ The key performance metrics to watch are:
 
 - `consul.raft.commitTime` measures the time to commit new writes on a quorum of
   servers. It should be the same or lower after deploying WAL. Even if WAL is
   faster for your workload and hardware, it may not be reflected in `commitTime`
-  until enough followers are using WAL so that the leader does not have to wait for
-  a slower follower a cluster of three or two slower followers in a cluster of five to catch up.
+  until enough followers are using WAL that the leader does not have to wait for
+  two slower followers in a cluster of five to catch up.
 
 - `consul.raft.rpc.appendEntries.storeLogs` measures the time spent persisting
   logs to disk on each _follower_. It should be the same or lower for
 
 - `consul.raft.replication.appendEntries.rpc` measures the time taken for each
   `AppendEntries` RPC from the leader's perspective. If this is significantly
   higher than `consul.raft.rpc.appendEntries` on the follower, it indicates a
-  known queuing issue in our Raft library that and is unrelated to the backend.
+  known queuing issue in the Raft library and is unrelated to the backend.
   Followers with WAL enabled should not be slower than the others. You can
   determine which follower is associated with which metric by running the
   `consul operator raft list-peers` command and matching the
diff --git a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx
index 6760ca2a92f1..2bd0638b7cd3 100644
--- a/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx
+++ b/website/content/docs/agent/wal-logstore/revert-to-boltdb.mdx
@@ -29,7 +29,7 @@ If your environment uses configuration management automation that might interfer
 
 ## Remove data directory from target server
 
-Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend doing this in case you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory.
+Temporarily moving the data directory to a different location is less destructive than deleting it. We recommend moving the data directory instead of deleting it in cases where you unsuccessfully enable WAL. Do not use the old data directory (`/data-dir/raft.bak`) for recovery after restarting the server. We recommend eventually deleting the old directory.
 
 The following example assumes the `data_dir` in the server's configuration is `/data-dir` and renames the raft directory to `/data-dir/raft.wal.bak`.
 
 ```shell-session
 $ mv /data-dir/raft /data-dir/raft.wal.bak
 ```
 
-When switching backend, you must always remove the _whole raft directory_ not just the `raft.db` file or `wal` directory. This is because the log must always be consistent with the snapshots to avoid undefined behavior or data loss.
+When switching backends, you must always remove _the entire raft directory_, not just the `raft.db` file or `wal` directory. This is because the log must always be consistent with the snapshots to avoid undefined behavior or data loss.
## Update target server's configuration @@ -69,7 +69,7 @@ $ consul operator raft list-peers ### Clean up old data directories -If necessary, cleanup any `raft.wal.bak` directories. Replace `/data-dir` with the value you have specified in your configuration file. +If necessary, clean up any `raft.wal.bak` directories. Replace `/data-dir` with the value you specified in your configuration file. ```shell-session $ rm /data-dir/raft.bak
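# If you renamed the old raft directory to raft.wal.bak as described earlier in this
# guide, remove that path instead. For example, assuming the same /data-dir layout
# (-r is needed because the backup is a directory):
$ rm -r /data-dir/raft.wal.bak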