From 6d189c5954653e8df4ac20e98974fa307e5c2efe Mon Sep 17 00:00:00 2001 From: Tobias Schottdorf Date: Wed, 10 Apr 2019 17:16:47 +0200 Subject: [PATCH] storage: prevent crash migrating from 19.1-beta into 19.1-rcX When I landed the change to stop sending the Raft log in snapshots, I gated this on whether the truncated state had already been unreplicated for the range. However, this wasn't enough because older 19.1 betas knew about unreplicated truncated state and yet couldn't handle a regressing truncated state, which sending these snapshots could introduce. As a result, 19.1-beta nodes could crash while running mixed with 19.1-rcX. (Simply restarting those nodes with the upgraded binary should fix the problem). This PR breaks one of our rules around not introducing historical cluster versions, but in this case it's necessary and also shouldn't have any adverse effects. See #36680. Release note (bug fix): prevent a crash that could occur when running a cluster mixed between 19.1-beta and 19.1-rcX nodes. The crash would manifest with a fatal error stating "TruncatedState regressed". Moving all nodes to the new binary (19.1-rcX or newer) rectifies this situation. This wouldn't affect anyone migrating directly from 2.1.x into 19.1.x, as the majority of our users are expected to. --- pkg/settings/cluster/cockroach_versions.go | 5 +++++ pkg/storage/replica_command.go | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pkg/settings/cluster/cockroach_versions.go b/pkg/settings/cluster/cockroach_versions.go index a781e655c6e6..008e9e5bce21 100644 --- a/pkg/settings/cluster/cockroach_versions.go +++ b/pkg/settings/cluster/cockroach_versions.go @@ -66,6 +66,7 @@ const ( VersionDirectImport VersionSideloadedStorageNoReplicaID // see versionsSingleton for details VersionPushTxnToInclusive + VersionSnapshotsWithoutLog Version19_1 // Add new versions here (step one of two). @@ -445,6 +446,10 @@ var versionsSingleton = keyedVersions([]keyedVersion{ Version: roachpb.Version{Major: 2, Minor: 1, Unstable: 10}, }, { + // VersionSnapshotsWithoutLog is https://github.com/cockroachdb/cockroach/pull/36714. + Key: VersionSnapshotsWithoutLog, + Version: roachpb.Version{Major: 2, Minor: 1, Unstable: 11}, + }, { // Version19_1 is CockroachDB v19.1. It's used for all v19.1.x patch releases. Key: Version19_1, Version: roachpb.Version{Major: 19, Minor: 1}, diff --git a/pkg/storage/replica_command.go b/pkg/storage/replica_command.go index c5e17476e1de..824365f15b63 100644 --- a/pkg/storage/replica_command.go +++ b/pkg/storage/replica_command.go @@ -27,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/storage/engine" "github.com/cockroachdb/cockroach/pkg/storage/storagebase" "github.com/cockroachdb/cockroach/pkg/storage/storagepb" @@ -903,7 +904,11 @@ func (r *Replica) sendSnapshot( return errors.Wrap(err, "loading legacy truncated state") } - if !usesReplicatedTruncatedState && snap.State.TruncatedState.Index < snap.State.RaftAppliedIndex { + canAvoidSendingLog := !usesReplicatedTruncatedState && + snap.State.TruncatedState.Index < snap.State.RaftAppliedIndex && + r.store.ClusterSettings().Version.IsActive(cluster.VersionSnapshotsWithoutLog) + + if canAvoidSendingLog { // If we're not using a legacy (replicated) truncated state, we avoid // sending the (past) Raft log in the snapshot in the first place and // send only those entries that are actually useful to the follower.