From 3d4499ae5344a2e9ac635e8b8b1f6217a059c6a1 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 14 May 2021 17:43:51 -0700 Subject: [PATCH 1/2] add v3.6 to v3.5 downgrade support automatically --- server/config/config.go | 6 ++++ server/embed/config.go | 12 +++++++ server/embed/etcd.go | 3 +- server/etcdmain/config.go | 1 + server/etcdmain/help.go | 3 ++ server/etcdserver/api/membership/cluster.go | 11 +++++-- server/etcdserver/api/membership/downgrade.go | 23 +++++++++---- .../api/membership/downgrade_test.go | 33 ++++++++++++++----- server/etcdserver/cluster_util.go | 21 ++++++++---- server/etcdserver/cluster_util_test.go | 27 ++++++++++++--- server/etcdserver/raft.go | 6 ++++ server/etcdserver/server.go | 8 ++++- 12 files changed, 125 insertions(+), 29 deletions(-) diff --git a/server/config/config.go b/server/config/config.go index b6e2109c228..01e8fa5f6f8 100644 --- a/server/config/config.go +++ b/server/config/config.go @@ -183,6 +183,12 @@ type ServerConfig struct { // consider running defrag during bootstrap. Needs to be set to non-zero value to take effect. ExperimentalBootstrapDefragThresholdMegabytes uint `json:"experimental-bootstrap-defrag-threshold-megabytes"` + // UnsafeAllowClusterVersionDowngrade is "true" to allow cluster version downgrade. + // "false" by default, since newer minor versions may introduce incompatible feature changes. + // For instance, lease checkpointer request to 3.4 will fail the remaining 3.3 nodes. + // But, if one does not use "lease checkpointer" feature, it can be safe to run 3.3 along with 3.4. + UnsafeAllowClusterVersionDowngrade bool `json:"unsafe-allow-cluster-version-downgrade"` + // V2Deprecation defines a phase of v2store deprecation process. V2Deprecation V2DeprecationEnum `json:"v2-deprecation"` } diff --git a/server/embed/config.go b/server/embed/config.go index 380c0c3aaa8..d29d360b139 100644 --- a/server/embed/config.go +++ b/server/embed/config.go @@ -89,6 +89,10 @@ const ( // v2 API is disabled by default. 
DefaultEnableV2 = false + // DefaultUnsafeAllowClusterVersionDowngrade is the default value for "unsafe-allow-cluster-version-downgrade" flag. + // unsafe allow cluster version downgrade is disabled by default + DefaultUnsafeAllowClusterVersionDowngrade = false + // maxElectionMs specifies the maximum value of election timeout. // More details are listed in ../Documentation/tuning.md#time-parameters. maxElectionMs = 50000 @@ -392,6 +396,12 @@ type Config struct { // ExperimentalTxnModeWriteWithSharedBuffer enables write transaction to use a shared buffer in its readonly check operations. ExperimentalTxnModeWriteWithSharedBuffer bool `json:"experimental-txn-mode-write-with-shared-buffer"` + // UnsafeAllowClusterVersionDowngrade is "true" to allow cluster version downgrade. + // "false" by default, since newer minor versions may introduce incompatible feature changes. + // For instance, lease checkpointer request to 3.4 will fail the remaining 3.3 nodes. + // But, if one does not use "lease checkpointer" feature, it can be safe to run 3.3 along with 3.4. 
+ UnsafeAllowClusterVersionDowngrade bool `json:"unsafe-allow-cluster-version-downgrade"` + // V2Deprecation describes phase of API & Storage V2 support V2Deprecation config.V2DeprecationEnum `json:"v2-deprecation"` } @@ -489,6 +499,8 @@ func NewConfig() *Config { ExperimentalMemoryMlock: false, ExperimentalTxnModeWriteWithSharedBuffer: true, + UnsafeAllowClusterVersionDowngrade: DefaultUnsafeAllowClusterVersionDowngrade, + V2Deprecation: config.V2_DEPR_DEFAULT, } cfg.InitialCluster = cfg.InitialClusterFromName(cfg.Name) diff --git a/server/embed/etcd.go b/server/embed/etcd.go index 001302f991b..48f79d389a9 100644 --- a/server/embed/etcd.go +++ b/server/embed/etcd.go @@ -223,7 +223,8 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) { ExperimentalMemoryMlock: cfg.ExperimentalMemoryMlock, ExperimentalTxnModeWriteWithSharedBuffer: cfg.ExperimentalTxnModeWriteWithSharedBuffer, ExperimentalBootstrapDefragThresholdMegabytes: cfg.ExperimentalBootstrapDefragThresholdMegabytes, - V2Deprecation: cfg.V2DeprecationEffective(), + UnsafeAllowClusterVersionDowngrade: cfg.UnsafeAllowClusterVersionDowngrade, + V2Deprecation: cfg.V2DeprecationEffective(), } if srvcfg.ExperimentalEnableDistributedTracing { diff --git a/server/etcdmain/config.go b/server/etcdmain/config.go index f9c91d9f9c5..d3c9c532ba8 100644 --- a/server/etcdmain/config.go +++ b/server/etcdmain/config.go @@ -291,6 +291,7 @@ func newConfig() *config { // unsafe fs.BoolVar(&cfg.ec.UnsafeNoFsync, "unsafe-no-fsync", false, "Disables fsync, unsafe, will cause data loss.") + fs.BoolVar(&cfg.ec.UnsafeAllowClusterVersionDowngrade, "unsafe-allow-cluster-version-downgrade", embed.DefaultUnsafeAllowClusterVersionDowngrade, "true to allow cluster version downgrade, because newer minor versions may introduce incompatible feature changes like lease checkpointer introduced in v3.4") fs.BoolVar(&cfg.ec.ForceNewCluster, "force-new-cluster", false, "Force to create a new one member cluster.") // ignored diff --git 
a/server/etcdmain/help.go b/server/etcdmain/help.go index dc5b55fae7e..5c39e8ebab7 100644 --- a/server/etcdmain/help.go +++ b/server/etcdmain/help.go @@ -252,6 +252,9 @@ Unsafe feature: Force to create a new one-member cluster. --unsafe-no-fsync 'false' Disables fsync, unsafe, will cause data loss. + --unsafe-allow-cluster-version-downgrade 'false' + Allow cluster version downgrade, unsafe, newer minor versions may introduce incompatible feature changes. + For instance, experimental lease checkpointer is enabled in 3.4 and downgrade to 3.3 will fail. CAUTIOUS with unsafe flag! It may break the guarantees given by the consensus protocol! ` diff --git a/server/etcdserver/api/membership/cluster.go b/server/etcdserver/api/membership/cluster.go index 3187d12f7d1..6517c0fe1aa 100644 --- a/server/etcdserver/api/membership/cluster.go +++ b/server/etcdserver/api/membership/cluster.go @@ -52,6 +52,9 @@ type RaftCluster struct { v2store v2store.Store be backend.Backend + // Readonly field after initialization + unsafeAllowDowngrade bool + sync.Mutex // guards the fields below version *semver.Version members map[types.ID]*Member @@ -268,7 +271,7 @@ func (c *RaftCluster) Recover(onSet func(*zap.Logger, *semver.Version)) { if c.downgradeInfo != nil { d = &DowngradeInfo{Enabled: c.downgradeInfo.Enabled, TargetVersion: c.downgradeInfo.TargetVersion} } - mustDetectDowngrade(c.lg, c.version, d) + mustDetectDowngrade(c.lg, c.version, d, c.unsafeAllowDowngrade) onSet(c.lg, c.version) for _, m := range c.members { @@ -536,7 +539,7 @@ func (c *RaftCluster) SetVersion(ver *semver.Version, onSet func(*zap.Logger, *s } oldVer := c.version c.version = ver - mustDetectDowngrade(c.lg, c.version, c.downgradeInfo) + mustDetectDowngrade(c.lg, c.version, c.downgradeInfo, c.unsafeAllowDowngrade) if c.v2store != nil { mustSaveClusterVersionToStore(c.lg, c.v2store, ver) } @@ -550,6 +553,10 @@ func (c *RaftCluster) SetVersion(ver *semver.Version, onSet func(*zap.Logger, *s onSet(c.lg, ver) } +func (c 
*RaftCluster) AllowUnsafeDowngrade() { + c.unsafeAllowDowngrade = true +} + func (c *RaftCluster) IsReadyToAddVotingMember() bool { nmembers := 1 nstarted := 0 diff --git a/server/etcdserver/api/membership/downgrade.go b/server/etcdserver/api/membership/downgrade.go index 9fdafe22aae..88a07f8d16c 100644 --- a/server/etcdserver/api/membership/downgrade.go +++ b/server/etcdserver/api/membership/downgrade.go @@ -38,7 +38,7 @@ func isValidDowngrade(verFrom *semver.Version, verTo *semver.Version) bool { } // mustDetectDowngrade will detect unexpected downgrade when the local server is recovered. -func mustDetectDowngrade(lg *zap.Logger, cv *semver.Version, d *DowngradeInfo) { +func mustDetectDowngrade(lg *zap.Logger, cv *semver.Version, d *DowngradeInfo, unsafeAllowDowngrade bool) { lv := semver.Must(semver.NewVersion(version.Version)) // only keep major.minor version for comparison against cluster version lv = &semver.Version{Major: lv.Major, Minor: lv.Minor} @@ -63,14 +63,25 @@ func mustDetectDowngrade(lg *zap.Logger, cv *semver.Version, d *DowngradeInfo) { ) } + // if unsafe downgrade is allowed and the local version is one minor version lower than the cluster version, + // do not fail (e.g., local version 3.4, cluster version 3.5) // if the cluster disables downgrade, check local version against determined cluster version. 
// the validation passes when local version is not less than cluster version if cv != nil && lv.LessThan(*cv) { - lg.Fatal( - "invalid downgrade; server version is lower than determined cluster version", - zap.String("current-server-version", version.Version), - zap.String("determined-cluster-version", version.Cluster(cv.String())), - ) + if unsafeAllowDowngrade && isValidDowngrade(cv, lv) { + lg.Warn("allowing unsafe downgrade; local server version is lower than determined cluster version", + zap.String("current-server-version", version.Version), + zap.String("determined-cluster-version", version.Cluster(cv.String())), + zap.String("target-cluster-version", version.Cluster(lv.String())), + ) + // overwrite the cluster version with local version determined by the etcd binary version + *cv = *lv + } else { + lg.Fatal("invalid downgrade, not allowed; local server version is lower than determined cluster version", + zap.String("current-server-version", version.Version), + zap.String("determined-cluster-version", version.Cluster(cv.String())), + ) + } } } diff --git a/server/etcdserver/api/membership/downgrade_test.go b/server/etcdserver/api/membership/downgrade_test.go index 8bb612d3540..a29f698f837 100644 --- a/server/etcdserver/api/membership/downgrade_test.go +++ b/server/etcdserver/api/membership/downgrade_test.go @@ -40,16 +40,18 @@ func TestMustDetectDowngrade(t *testing.T) { downgradeDisabled := &DowngradeInfo{Enabled: false} tests := []struct { - name string - clusterVersion *semver.Version - downgrade *DowngradeInfo - success bool - message string + name string + clusterVersion *semver.Version + downgrade *DowngradeInfo + unsafeAllowDowngrade bool + success bool + message string }{ { "Succeeded when downgrade is disabled and cluster version is nil", nil, downgradeDisabled, + false, true, "", }, @@ -57,6 +59,7 @@ func TestMustDetectDowngrade(t *testing.T) { "Succeeded when downgrade is disabled and cluster version is one minor lower", oneMinorLower, 
downgradeDisabled, + false, true, "", }, @@ -64,20 +67,31 @@ func TestMustDetectDowngrade(t *testing.T) { "Succeeded when downgrade is disabled and cluster version is server version", lv, downgradeDisabled, + false, true, "", }, + { + "Succeed when downgrade is disabled, unsafeDowngrade is enabled and cluster version is one minor higher", + oneMinorHigher, + downgradeDisabled, + true, + true, + "allowing unsafe downgrade; local server version is lower than determined cluster version", + }, { "Failed when downgrade is disabled and server version is lower than determined cluster version ", oneMinorHigher, downgradeDisabled, false, - "invalid downgrade; server version is lower than determined cluster version", + false, + "invalid downgrade, not allowed; local server version is lower than determined cluster version", }, { "Succeeded when downgrade is enabled and cluster version is nil", nil, downgradeEnabledEqualVersion, + false, true, "", }, @@ -85,6 +99,7 @@ func TestMustDetectDowngrade(t *testing.T) { "Failed when downgrade is enabled and server version is target version", lv, downgradeEnabledEqualVersion, + false, true, "cluster is downgrading to target version", }, @@ -93,6 +108,7 @@ func TestMustDetectDowngrade(t *testing.T) { lv, downgradeEnabledLowerVersion, false, + false, "invalid downgrade; server version is not allowed to join when downgrade is enabled", }, { @@ -100,14 +116,15 @@ func TestMustDetectDowngrade(t *testing.T) { nil, downgradeEnabledHigherVersion, false, + false, "invalid downgrade; server version is not allowed to join when downgrade is enabled", }, - { "Failed when downgrade is enabled and local version is out of range", lv, downgradeEnabledHigherVersion, false, + false, "invalid downgrade; server version is not allowed to join when downgrade is enabled", }, } @@ -122,7 +139,7 @@ func TestMustDetectDowngrade(t *testing.T) { lcfg.ErrorOutputPaths = []string{logPath} lg, _ := lcfg.Build() - mustDetectDowngrade(lg, tests[iint].clusterVersion, 
tests[iint].downgrade) + mustDetectDowngrade(lg, tests[iint].clusterVersion, tests[iint].downgrade, tests[iint].unsafeAllowDowngrade) return } diff --git a/server/etcdserver/cluster_util.go b/server/etcdserver/cluster_util.go index 595586e2012..5320191c112 100644 --- a/server/etcdserver/cluster_util.go +++ b/server/etcdserver/cluster_util.go @@ -200,9 +200,10 @@ func decideClusterVersion(lg *zap.Logger, vers map[string]*version.Versions) *se } // allowedVersionRange decides the available version range of the cluster that local server can join in; -// if the downgrade enabled status is true, the version window is [oneMinorHigher, oneMinorHigher] -// if the downgrade is not enabled, the version window is [MinClusterVersion, localVersion] -func allowedVersionRange(downgradeEnabled bool) (minV *semver.Version, maxV *semver.Version) { +// if the downgrade enabled status is true, the version window is [oneMinorHigherThanLocalVersion, oneMinorHigherThanLocalVersion] +// otherwise, if the unsafeDowngrade enabled status is true, the version window is [MinClusterVersion, oneMinorHigherThanLocalVersion], +// if neither downgrade nor unsafeDowngrade is enabled, the version window is [MinClusterVersion, localVersion] +func allowedVersionRange(downgradeEnabled bool, unsafeDowngradeEnabled bool) (minV *semver.Version, maxV *semver.Version) { minV = semver.Must(semver.NewVersion(version.MinClusterVersion)) maxV = semver.Must(semver.NewVersion(version.Version)) maxV = &semver.Version{Major: maxV.Major, Minor: maxV.Minor} @@ -211,7 +212,14 @@ func allowedVersionRange(downgradeEnabled bool) (minV *semver.Version, maxV *sem // Todo: handle the case that downgrading from higher major version(e.g. 
downgrade from v4.0 to v3.x) maxV.Minor = maxV.Minor + 1 minV = &semver.Version{Major: maxV.Major, Minor: maxV.Minor} + return minV, maxV } + + // if unsafeDowngrade is enabled, allow one minor version down + if unsafeDowngradeEnabled { + maxV.Minor = maxV.Minor + 1 + } + return minV, maxV } @@ -221,9 +229,9 @@ func allowedVersionRange(downgradeEnabled bool) (minV *semver.Version, maxV *sem // cluster version in the range of [MinV, MaxV] and no known members has a cluster version // out of the range. // We set this rule since when the local member joins, another member might be offline. -func isCompatibleWithCluster(lg *zap.Logger, cl *membership.RaftCluster, local types.ID, rt http.RoundTripper) bool { +func isCompatibleWithCluster(lg *zap.Logger, cl *membership.RaftCluster, local types.ID, rt http.RoundTripper, unsafeAllowClusterVersionDowngrade bool) bool { vers := getVersions(lg, cl, local, rt) - minV, maxV := allowedVersionRange(getDowngradeEnabledFromRemotePeers(lg, cl, local, rt)) + minV, maxV := allowedVersionRange(getDowngradeEnabledFromRemotePeers(lg, cl, local, rt), unsafeAllowClusterVersionDowngrade) return isCompatibleWithVers(lg, vers, local, minV, maxV) } @@ -256,12 +264,13 @@ func isCompatibleWithVers(lg *zap.Logger, vers map[string]*version.Versions, loc ) return false } + if maxV.LessThan(*clusterv) { lg.Warn( "cluster version of remote member is not compatible; too high", zap.String("remote-member-id", id), zap.String("remote-member-cluster-version", clusterv.String()), - zap.String("minimum-cluster-version-supported", minV.String()), + zap.String("maximum-cluster-version-supported", maxV.String()), ) return false } diff --git a/server/etcdserver/cluster_util_test.go b/server/etcdserver/cluster_util_test.go index f2196b84dc3..a28d0aa411a 100644 --- a/server/etcdserver/cluster_util_test.go +++ b/server/etcdserver/cluster_util_test.go @@ -183,20 +183,37 @@ func TestDecideAllowedVersionRange(t *testing.T) { localV = &semver.Version{Major: 
localV.Major, Minor: localV.Minor} tests := []struct { - name string - downgradeEnabled bool - expectedMinV *semver.Version - expectedMaxV *semver.Version + name string + downgradeEnabled bool + unsafeDowngradeEnabled bool + expectedMinV *semver.Version + expectedMaxV *semver.Version }{ { "When cluster enables downgrade", true, + false, + &semver.Version{Major: localV.Major, Minor: localV.Minor + 1}, + &semver.Version{Major: localV.Major, Minor: localV.Minor + 1}, + }, + { + "When cluster enables downgrade and unsafeDowngrade", + true, + true, &semver.Version{Major: localV.Major, Minor: localV.Minor + 1}, &semver.Version{Major: localV.Major, Minor: localV.Minor + 1}, }, + { + "When cluster disables downgrade and enables unsafeDowngrade", + false, + true, + minClusterV, + &semver.Version{Major: localV.Major, Minor: localV.Minor + 1}, + }, { "When cluster disables downgrade", false, + false, minClusterV, localV, }, @@ -204,7 +221,7 @@ func TestDecideAllowedVersionRange(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - minV, maxV := allowedVersionRange(tt.downgradeEnabled) + minV, maxV := allowedVersionRange(tt.downgradeEnabled, tt.unsafeDowngradeEnabled) if !minV.Equal(*tt.expectedMinV) { t.Errorf("Expected minV is %v; Got %v", tt.expectedMinV.String(), minV.String()) } diff --git a/server/etcdserver/raft.go b/server/etcdserver/raft.go index 8b9600d39ce..41b8699c532 100644 --- a/server/etcdserver/raft.go +++ b/server/etcdserver/raft.go @@ -488,6 +488,9 @@ func restartNode(cfg config.ServerConfig, snapshot *raftpb.Snapshot) (types.ID, ) cl := membership.NewCluster(cfg.Logger) cl.SetID(id, cid) + if cfg.UnsafeAllowClusterVersionDowngrade { + cl.AllowUnsafeDowngrade() + } s := raft.NewMemoryStorage() if snapshot != nil { s.ApplySnapshot(*snapshot) @@ -562,6 +565,9 @@ func restartAsStandaloneNode(cfg config.ServerConfig, snapshot *raftpb.Snapshot) cl := membership.NewCluster(cfg.Logger) cl.SetID(id, cid) + if 
cfg.UnsafeAllowClusterVersionDowngrade { + cl.AllowUnsafeDowngrade() + } s := raft.NewMemoryStorage() if snapshot != nil { s.ApplySnapshot(*snapshot) diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index 2c8855ec33d..2e04c2cc81b 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -421,7 +421,7 @@ func NewServer(cfg config.ServerConfig) (srv *EtcdServer, err error) { if err = membership.ValidateClusterAndAssignIDs(cfg.Logger, cl, existingCluster); err != nil { return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err) } - if !isCompatibleWithCluster(cfg.Logger, cl, cl.MemberByName(cfg.Name).ID, prt) { + if !isCompatibleWithCluster(cfg.Logger, cl, cl.MemberByName(cfg.Name).ID, prt, cfg.UnsafeAllowClusterVersionDowngrade) { return nil, fmt.Errorf("incompatible with current running cluster") } @@ -429,6 +429,9 @@ func NewServer(cfg config.ServerConfig) (srv *EtcdServer, err error) { cl.SetID(types.ID(0), existingCluster.ID()) cl.SetStore(st) cl.SetBackend(be) + if cfg.UnsafeAllowClusterVersionDowngrade { + cl.AllowUnsafeDowngrade() + } id, n, s, w = startNode(cfg, cl, nil) cl.SetID(id, existingCluster.ID()) @@ -464,6 +467,9 @@ func NewServer(cfg config.ServerConfig) (srv *EtcdServer, err error) { } cl.SetStore(st) cl.SetBackend(be) + if cfg.UnsafeAllowClusterVersionDowngrade { + cl.AllowUnsafeDowngrade() + } id, n, s, w = startNode(cfg, cl, cl.MemberIDs()) cl.SetID(id, cl.ID()) From 94f99362584ae0c8ca9bed2c56a0c35c87b6449f Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 8 Jun 2021 17:26:46 -0700 Subject: [PATCH 2/2] CHANGELOG-3.5.md: update --- CHANGELOG-3.5.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG-3.5.md b/CHANGELOG-3.5.md index e40a847d091..942a0cb8f15 100644 --- a/CHANGELOG-3.5.md +++ b/CHANGELOG-3.5.md @@ -116,6 +116,7 @@ Note that any `etcd_debugging_*` metrics are experimental and subject to change. - [`etcd --backend-bbolt-freelist-type`] flag is now stable. 
- `etcd --experimental-backend-bbolt-freelist-type` has been deprecated. - Support [downgrade API](https://github.com/etcd-io/etcd/pull/11715). +- Add [`etcd --unsafe-allow-cluster-version-downgrade`](https://github.com/etcd-io/etcd/pull/13022) to allow an unsafe one-minor-version cluster version downgrade instead of failing. - Deprecate v2 apply on cluster version. [Use v3 request to set cluster version and recover cluster version from v3 backend](https://github.com/etcd-io/etcd/pull/11427). - [Use v2 api to update cluster version to support mixed version cluster during upgrade](https://github.com/etcd-io/etcd/pull/12988). - [Fix corruption bug in defrag](https://github.com/etcd-io/etcd/pull/11613).