Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

*: add "--unsafe-allow-cluster-version-downgrade" for not failing cluster version downgrade #13022

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG-3.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ Note that any `etcd_debugging_*` metrics are experimental and subject to change.
- [`etcd --backend-bbolt-freelist-type`] flag is now stable.
- `etcd --experimental-backend-bbolt-freelist-type` has been deprecated.
- Support [downgrade API](https://github.com/etcd-io/etcd/pull/11715).
- Add [`etcd --unsafe-allow-cluster-version-downgrade`](https://github.com/etcd-io/etcd/pull/13022) flag to allow a cluster version downgrade instead of failing on it.
- Deprecate v2 apply on cluster version. [Use v3 request to set cluster version and recover cluster version from v3 backend](https://github.com/etcd-io/etcd/pull/11427).
- [Use v2 api to update cluster version to support mixed version cluster during upgrade](https://github.com/etcd-io/etcd/pull/12988).
- [Fix corruption bug in defrag](https://github.com/etcd-io/etcd/pull/11613).
Expand Down
6 changes: 6 additions & 0 deletions server/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,12 @@ type ServerConfig struct {
// consider running defrag during bootstrap. Needs to be set to non-zero value to take effect.
ExperimentalBootstrapDefragThresholdMegabytes uint `json:"experimental-bootstrap-defrag-threshold-megabytes"`

// UnsafeAllowClusterVersionDowngrade is "true" to allow cluster version downgrade.
// "false" by default, since newer minor versions may introduce incompatible feature changes.
// For instance, lease checkpointer request to 3.4 will fail the remaining 3.3 nodes.
// But, if one does not use "lease checkpointer" feature, it can be safe to run 3.3 along with 3.4.
UnsafeAllowClusterVersionDowngrade bool `json:"unsafe-allow-cluster-version-downgrade"`

// V2Deprecation defines a phase of v2store deprecation process.
V2Deprecation V2DeprecationEnum `json:"v2-deprecation"`
}
Expand Down
12 changes: 12 additions & 0 deletions server/embed/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ const (
// v2 API is disabled by default.
DefaultEnableV2 = false

// DefaultUnsafeAllowClusterVersionDowngrade is the default value for "unsafe-allow-cluster-version-downgrade" flag.
// Unsafe cluster version downgrade is disabled by default.
DefaultUnsafeAllowClusterVersionDowngrade = false

// maxElectionMs specifies the maximum value of election timeout.
// More details are listed in ../Documentation/tuning.md#time-parameters.
maxElectionMs = 50000
Expand Down Expand Up @@ -392,6 +396,12 @@ type Config struct {
// ExperimentalTxnModeWriteWithSharedBuffer enables write transaction to use a shared buffer in its readonly check operations.
ExperimentalTxnModeWriteWithSharedBuffer bool `json:"experimental-txn-mode-write-with-shared-buffer"`

// UnsafeAllowClusterVersionDowngrade is "true" to allow cluster version downgrade.
// "false" by default, since newer minor versions may introduce incompatible feature changes.
// For instance, lease checkpointer request to 3.4 will fail the remaining 3.3 nodes.
// But, if one does not use "lease checkpointer" feature, it can be safe to run 3.3 along with 3.4.
UnsafeAllowClusterVersionDowngrade bool `json:"unsafe-allow-cluster-version-downgrade"`

// V2Deprecation describes phase of API & Storage V2 support
V2Deprecation config.V2DeprecationEnum `json:"v2-deprecation"`
}
Expand Down Expand Up @@ -489,6 +499,8 @@ func NewConfig() *Config {
ExperimentalMemoryMlock: false,
ExperimentalTxnModeWriteWithSharedBuffer: true,

UnsafeAllowClusterVersionDowngrade: DefaultUnsafeAllowClusterVersionDowngrade,

V2Deprecation: config.V2_DEPR_DEFAULT,
}
cfg.InitialCluster = cfg.InitialClusterFromName(cfg.Name)
Expand Down
3 changes: 2 additions & 1 deletion server/embed/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
ExperimentalMemoryMlock: cfg.ExperimentalMemoryMlock,
ExperimentalTxnModeWriteWithSharedBuffer: cfg.ExperimentalTxnModeWriteWithSharedBuffer,
ExperimentalBootstrapDefragThresholdMegabytes: cfg.ExperimentalBootstrapDefragThresholdMegabytes,
V2Deprecation: cfg.V2DeprecationEffective(),
UnsafeAllowClusterVersionDowngrade: cfg.UnsafeAllowClusterVersionDowngrade,
V2Deprecation: cfg.V2DeprecationEffective(),
}

if srvcfg.ExperimentalEnableDistributedTracing {
Expand Down
1 change: 1 addition & 0 deletions server/etcdmain/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ func newConfig() *config {

// unsafe
fs.BoolVar(&cfg.ec.UnsafeNoFsync, "unsafe-no-fsync", false, "Disables fsync, unsafe, will cause data loss.")
fs.BoolVar(&cfg.ec.UnsafeAllowClusterVersionDowngrade, "unsafe-allow-cluster-version-downgrade", embed.DefaultUnsafeAllowClusterVersionDowngrade, "true to allow cluster version downgrade, because newer minor versions may introduce incompatible feature changes like lease checkpointer introduced in v3.4")
fs.BoolVar(&cfg.ec.ForceNewCluster, "force-new-cluster", false, "Force to create a new one member cluster.")

// ignored
Expand Down
3 changes: 3 additions & 0 deletions server/etcdmain/help.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,9 @@ Unsafe feature:
Force to create a new one-member cluster.
--unsafe-no-fsync 'false'
Disables fsync, unsafe, will cause data loss.
--unsafe-allow-cluster-version-downgrade 'false'
Allow cluster version downgrade, unsafe, newer minor versions may introduce incompatible feature changes.
For instance, experimental lease checkpointer is enabled in 3.4 and downgrade to 3.3 will fail.

CAUTIOUS with unsafe flag! It may break the guarantees given by the consensus protocol!
`
Expand Down
11 changes: 9 additions & 2 deletions server/etcdserver/api/membership/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ type RaftCluster struct {
v2store v2store.Store
be backend.Backend

// Readonly field after initialization
unsafeAllowDowngrade bool

sync.Mutex // guards the fields below
version *semver.Version
members map[types.ID]*Member
Expand Down Expand Up @@ -268,7 +271,7 @@ func (c *RaftCluster) Recover(onSet func(*zap.Logger, *semver.Version)) {
if c.downgradeInfo != nil {
d = &DowngradeInfo{Enabled: c.downgradeInfo.Enabled, TargetVersion: c.downgradeInfo.TargetVersion}
}
mustDetectDowngrade(c.lg, c.version, d)
mustDetectDowngrade(c.lg, c.version, d, c.unsafeAllowDowngrade)
onSet(c.lg, c.version)

for _, m := range c.members {
Expand Down Expand Up @@ -536,7 +539,7 @@ func (c *RaftCluster) SetVersion(ver *semver.Version, onSet func(*zap.Logger, *s
}
oldVer := c.version
c.version = ver
mustDetectDowngrade(c.lg, c.version, c.downgradeInfo)
mustDetectDowngrade(c.lg, c.version, c.downgradeInfo, c.unsafeAllowDowngrade)
if c.v2store != nil {
mustSaveClusterVersionToStore(c.lg, c.v2store, ver)
}
Expand All @@ -550,6 +553,10 @@ func (c *RaftCluster) SetVersion(ver *semver.Version, onSet func(*zap.Logger, *s
onSet(c.lg, ver)
}

// AllowUnsafeDowngrade sets the cluster's unsafeAllowDowngrade flag to true.
// Recover and SetVersion pass this flag to mustDetectDowngrade when validating
// the local server version against the cluster version.
// The flag is written without holding the cluster mutex, so this is presumably
// meant to be called only during initialization, before the cluster is shared
// (NOTE(review): confirm against callers).
func (c *RaftCluster) AllowUnsafeDowngrade() {
c.unsafeAllowDowngrade = true
}

func (c *RaftCluster) IsReadyToAddVotingMember() bool {
nmembers := 1
nstarted := 0
Expand Down
23 changes: 17 additions & 6 deletions server/etcdserver/api/membership/downgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func isValidDowngrade(verFrom *semver.Version, verTo *semver.Version) bool {
}

// mustDetectDowngrade will detect unexpected downgrade when the local server is recovered.
func mustDetectDowngrade(lg *zap.Logger, cv *semver.Version, d *DowngradeInfo) {
func mustDetectDowngrade(lg *zap.Logger, cv *semver.Version, d *DowngradeInfo, unsafeAllowDowngrade bool) {
lv := semver.Must(semver.NewVersion(version.Version))
// only keep major.minor version for comparison against cluster version
lv = &semver.Version{Major: lv.Major, Minor: lv.Minor}
Expand All @@ -63,14 +63,25 @@ func mustDetectDowngrade(lg *zap.Logger, cv *semver.Version, d *DowngradeInfo) {
)
}

// if unsafe downgrade is allowed and the local version is one minor version down,
// it is safe to not fail (e.g., local version 3.4, cluster version 3.5)
// if the cluster disables downgrade, check local version against determined cluster version.
// the validation passes when local version is not less than cluster version
if cv != nil && lv.LessThan(*cv) {
lg.Fatal(
"invalid downgrade; server version is lower than determined cluster version",
zap.String("current-server-version", version.Version),
zap.String("determined-cluster-version", version.Cluster(cv.String())),
)
if unsafeAllowDowngrade && isValidDowngrade(cv, lv) {
lg.Warn("allowing unsafe downgrade; local server version is lower than determined cluster version",
zap.String("current-server-version", version.Version),
zap.String("determined-cluster-version", version.Cluster(cv.String())),
zap.String("target-cluster-version", version.Cluster(lv.String())),
)
// overwrite the cluster version with local version determined by the etcd binary version
*cv = *lv
} else {
lg.Fatal("invalid downgrade, not allowed; local server version is lower than determined cluster version",
zap.String("current-server-version", version.Version),
zap.String("determined-cluster-version", version.Cluster(cv.String())),
)
}
}
}

Expand Down
33 changes: 25 additions & 8 deletions server/etcdserver/api/membership/downgrade_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,51 +40,66 @@ func TestMustDetectDowngrade(t *testing.T) {
downgradeDisabled := &DowngradeInfo{Enabled: false}

tests := []struct {
name string
clusterVersion *semver.Version
downgrade *DowngradeInfo
success bool
message string
name string
clusterVersion *semver.Version
downgrade *DowngradeInfo
unsafeAllowDowngrade bool
success bool
message string
}{
{
"Succeeded when downgrade is disabled and cluster version is nil",
nil,
downgradeDisabled,
false,
true,
"",
},
{
"Succeeded when downgrade is disabled and cluster version is one minor lower",
oneMinorLower,
downgradeDisabled,
false,
true,
"",
},
{
"Succeeded when downgrade is disabled and cluster version is server version",
lv,
downgradeDisabled,
false,
true,
"",
},
{
"Succeed when downgrade is disabled, unsafeDowngrade is enabled and cluster version is one minor higher",
oneMinorHigher,
downgradeDisabled,
true,
true,
"allowing unsafe downgrade; local server version is lower than determined cluster version",
},
{
"Failed when downgrade is disabled and server version is lower than determined cluster version ",
oneMinorHigher,
downgradeDisabled,
false,
"invalid downgrade; server version is lower than determined cluster version",
false,
"invalid downgrade, not allowed; local server version is lower than determined cluster version",
},
{
"Succeeded when downgrade is enabled and cluster version is nil",
nil,
downgradeEnabledEqualVersion,
false,
true,
"",
},
{
"Failed when downgrade is enabled and server version is target version",
lv,
downgradeEnabledEqualVersion,
false,
true,
"cluster is downgrading to target version",
},
Expand All @@ -93,21 +108,23 @@ func TestMustDetectDowngrade(t *testing.T) {
lv,
downgradeEnabledLowerVersion,
false,
false,
"invalid downgrade; server version is not allowed to join when downgrade is enabled",
},
{
"Failed when downgrade is enabled and local version is out of range and cluster version is nil",
nil,
downgradeEnabledHigherVersion,
false,
false,
"invalid downgrade; server version is not allowed to join when downgrade is enabled",
},

{
"Failed when downgrade is enabled and local version is out of range",
lv,
downgradeEnabledHigherVersion,
false,
false,
"invalid downgrade; server version is not allowed to join when downgrade is enabled",
},
}
Expand All @@ -122,7 +139,7 @@ func TestMustDetectDowngrade(t *testing.T) {
lcfg.ErrorOutputPaths = []string{logPath}
lg, _ := lcfg.Build()

mustDetectDowngrade(lg, tests[iint].clusterVersion, tests[iint].downgrade)
mustDetectDowngrade(lg, tests[iint].clusterVersion, tests[iint].downgrade, tests[iint].unsafeAllowDowngrade)
return
}

Expand Down
21 changes: 15 additions & 6 deletions server/etcdserver/cluster_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,10 @@ func decideClusterVersion(lg *zap.Logger, vers map[string]*version.Versions) *se
}

// allowedVersionRange decides the available version range of the cluster that local server can join in;
// if the downgrade enabled status is true, the version window is [oneMinorHigher, oneMinorHigher]
// if the downgrade is not enabled, the version window is [MinClusterVersion, localVersion]
func allowedVersionRange(downgradeEnabled bool) (minV *semver.Version, maxV *semver.Version) {
// if the downgrade enabled status is true, the version window is [oneMinorHigherThanLocalVersion, oneMinorHigherThanLocalVersion]
// otherwise, if the unsafeDowngrade enabled status is true, the version window is [MinClusterVersion, oneMinorHigherThanLocalVersion],
// if neither downgrade nor unsafeDowngrade is enabled, the version window is [MinClusterVersion, localVersion]
func allowedVersionRange(downgradeEnabled bool, unsafeDowngradeEnabled bool) (minV *semver.Version, maxV *semver.Version) {
minV = semver.Must(semver.NewVersion(version.MinClusterVersion))
maxV = semver.Must(semver.NewVersion(version.Version))
maxV = &semver.Version{Major: maxV.Major, Minor: maxV.Minor}
Expand All @@ -211,7 +212,14 @@ func allowedVersionRange(downgradeEnabled bool) (minV *semver.Version, maxV *sem
// Todo: handle the case that downgrading from higher major version(e.g. downgrade from v4.0 to v3.x)
maxV.Minor = maxV.Minor + 1
minV = &semver.Version{Major: maxV.Major, Minor: maxV.Minor}
return minV, maxV
}

// if unsafeDowngrade is enabled, allow one minor version down
if unsafeDowngradeEnabled {
maxV.Minor = maxV.Minor + 1
}

return minV, maxV
}

Expand All @@ -221,9 +229,9 @@ func allowedVersionRange(downgradeEnabled bool) (minV *semver.Version, maxV *sem
// cluster version in the range of [MinV, MaxV] and no known members has a cluster version
// out of the range.
// We set this rule since when the local member joins, another member might be offline.
func isCompatibleWithCluster(lg *zap.Logger, cl *membership.RaftCluster, local types.ID, rt http.RoundTripper) bool {
func isCompatibleWithCluster(lg *zap.Logger, cl *membership.RaftCluster, local types.ID, rt http.RoundTripper, unsafeAllowClusterVersionDowngrade bool) bool {
vers := getVersions(lg, cl, local, rt)
minV, maxV := allowedVersionRange(getDowngradeEnabledFromRemotePeers(lg, cl, local, rt))
minV, maxV := allowedVersionRange(getDowngradeEnabledFromRemotePeers(lg, cl, local, rt), unsafeAllowClusterVersionDowngrade)
return isCompatibleWithVers(lg, vers, local, minV, maxV)
}

Expand Down Expand Up @@ -256,12 +264,13 @@ func isCompatibleWithVers(lg *zap.Logger, vers map[string]*version.Versions, loc
)
return false
}

if maxV.LessThan(*clusterv) {
lg.Warn(
"cluster version of remote member is not compatible; too high",
zap.String("remote-member-id", id),
zap.String("remote-member-cluster-version", clusterv.String()),
zap.String("minimum-cluster-version-supported", minV.String()),
zap.String("maximum-cluster-version-supported", maxV.String()),
)
return false
}
Expand Down
27 changes: 22 additions & 5 deletions server/etcdserver/cluster_util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,28 +183,45 @@ func TestDecideAllowedVersionRange(t *testing.T) {
localV = &semver.Version{Major: localV.Major, Minor: localV.Minor}

tests := []struct {
name string
downgradeEnabled bool
expectedMinV *semver.Version
expectedMaxV *semver.Version
name string
downgradeEnabled bool
unsafeDowngradeEnabled bool
expectedMinV *semver.Version
expectedMaxV *semver.Version
}{
{
"When cluster enables downgrade",
true,
false,
&semver.Version{Major: localV.Major, Minor: localV.Minor + 1},
&semver.Version{Major: localV.Major, Minor: localV.Minor + 1},
},
{
"When cluster enables downgrade and unsafeDowngrade",
true,
true,
&semver.Version{Major: localV.Major, Minor: localV.Minor + 1},
&semver.Version{Major: localV.Major, Minor: localV.Minor + 1},
},
{
"When cluster disables downgrade and enables unsafeDowngrade",
false,
true,
minClusterV,
&semver.Version{Major: localV.Major, Minor: localV.Minor + 1},
},
{
"When cluster disables downgrade",
false,
false,
minClusterV,
localV,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
minV, maxV := allowedVersionRange(tt.downgradeEnabled)
minV, maxV := allowedVersionRange(tt.downgradeEnabled, tt.unsafeDowngradeEnabled)
if !minV.Equal(*tt.expectedMinV) {
t.Errorf("Expected minV is %v; Got %v", tt.expectedMinV.String(), minV.String())
}
Expand Down
6 changes: 6 additions & 0 deletions server/etcdserver/raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,9 @@ func restartNode(cfg config.ServerConfig, snapshot *raftpb.Snapshot) (types.ID,
)
cl := membership.NewCluster(cfg.Logger)
cl.SetID(id, cid)
if cfg.UnsafeAllowClusterVersionDowngrade {
cl.AllowUnsafeDowngrade()
}
s := raft.NewMemoryStorage()
if snapshot != nil {
s.ApplySnapshot(*snapshot)
Expand Down Expand Up @@ -562,6 +565,9 @@ func restartAsStandaloneNode(cfg config.ServerConfig, snapshot *raftpb.Snapshot)

cl := membership.NewCluster(cfg.Logger)
cl.SetID(id, cid)
if cfg.UnsafeAllowClusterVersionDowngrade {
cl.AllowUnsafeDowngrade()
}
s := raft.NewMemoryStorage()
if snapshot != nil {
s.ApplySnapshot(*snapshot)
Expand Down
Loading