Skip to content

Commit

Permalink
core: store and check for Raft version changes (#12362)
Browse files Browse the repository at this point in the history
Downgrading the Raft protocol version is not a supported operation.
Checking for a downgrade is hard since this information is not stored in
any persistent place. When a server re-joins a cluster with a prior Raft
version, the Serf tag is updated so Nomad can't tell that the version
changed.

Mixed version clusters must be supported to allow for zero-downtime
rolling upgrades. During this it's expected that the cluster will have
mixed Raft versions. Enforcing strong version consistency
would disrupt this flow.

The approach taken here is to store the Raft version on disk. When the
server starts the `raft_protocol` value is written to the file
`data_dir/raft/version`. If that file already exists, its content is
checked against the current `raft_protocol` value to detect downgrades
and prevent the server from starting.

Any other types of errors are ignored to prevent disruptions that are
outside the control of operators. The only option in cases of an invalid
or corrupt file would be to delete it, making this check useless. So
just overwrite its content with the new version and provide guidance on
how to check that their cluster is in an expected state.
  • Loading branch information
lgfa29 committed Mar 24, 2022
1 parent a7e5df8 commit 0783ac6
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .changelog/12362.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
server: store and check previous Raft protocol version to prevent downgrades
```
46 changes: 46 additions & 0 deletions nomad/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1284,6 +1284,16 @@ func (s *Server) setupRaft() error {
return err
}

// Check Raft version and update the version file.
raftVersionFilePath := filepath.Join(path, "version")
raftVersionFileContent := strconv.Itoa(int(s.config.RaftConfig.ProtocolVersion))
if err := s.checkRaftVersionFile(raftVersionFilePath); err != nil {
return err
}
if err := ioutil.WriteFile(raftVersionFilePath, []byte(raftVersionFileContent), 0644); err != nil {
return fmt.Errorf("failed to write Raft version file: %v", err)
}

// Create the BoltDB backend, with NoFreelistSync option
store, raftErr := raftboltdb.New(raftboltdb.Options{
Path: filepath.Join(path, "raft.db"),
Expand Down Expand Up @@ -1399,6 +1409,42 @@ func (s *Server) setupRaft() error {
return nil
}

// checkRaftVersionFile reads the Raft version file at path and returns an
// error if the version recorded there is greater than the currently
// configured Raft protocol version, because downgrading Raft is not supported.
//
// The check is best-effort. If the file does not exist there is nothing to
// compare against and nil is returned. If the file cannot be read, or its
// content is not a valid integer, a warning is logged and nil is returned so
// the caller is not blocked by a problem with the file itself.
func (s *Server) checkRaftVersionFile(path string) error {
	raftVersion := s.config.RaftConfig.ProtocolVersion
	baseWarning := "use the 'nomad operator raft list-peers' command to make sure the Raft protocol versions are consistent"

	// Read the file directly rather than calling os.Stat first: the read
	// reports a missing file through its own error, which avoids an extra
	// syscall and the window between checking for the file and opening it.
	v, err := ioutil.ReadFile(path)
	if err != nil {
		// A missing file is expected and silent; any other failure is only
		// worth a warning.
		if !os.IsNotExist(err) {
			s.logger.Warn(fmt.Sprintf("unable to read Raft version file, %s", baseWarning), "error", err)
		}
		return nil
	}

	// Surrounding whitespace (such as a trailing newline) is ignored when
	// parsing the stored version.
	previousVersion, err := strconv.Atoi(strings.TrimSpace(string(v)))
	if err != nil {
		s.logger.Warn(fmt.Sprintf("invalid Raft protocol version in Raft version file, %s", baseWarning), "error", err)
		return nil
	}

	// Only a strictly greater stored version is a downgrade; equal or lower
	// stored versions are accepted.
	if raft.ProtocolVersion(previousVersion) > raftVersion {
		return fmt.Errorf("downgrading Raft is not supported, current version is %d, previous version was %d", raftVersion, previousVersion)
	}

	return nil
}

// setupSerf is used to setup and initialize a Serf
func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) {
conf.Init()
Expand Down
24 changes: 24 additions & 0 deletions nomad/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -645,3 +645,27 @@ func TestServer_ReloadSchedulers_InvalidSchedulers(t *testing.T) {
currentWC = s.GetSchedulerWorkerConfig()
require.Equal(t, origWC, currentWC)
}

// TestServer_PreventRaftDowngrade starts a server with Raft protocol version
// 3, shuts it down, and then asserts that starting another server on the same
// data directory with Raft protocol version 2 returns an error.
func TestServer_PreventRaftDowngrade(t *testing.T) {
	ci.Parallel(t)

	// Both servers share one data directory so the second start sees whatever
	// the first one left behind.
	dataDir := t.TempDir()

	// First start: non-dev mode, Raft protocol version 3.
	configureV3 := func(cfg *Config) {
		cfg.DevMode = false
		cfg.DataDir = dataDir
		cfg.RaftConfig.ProtocolVersion = 3
	}
	_, stopV3 := TestServer(t, configureV3)
	stopV3()

	// Second start: same directory, but with the lower protocol version 2.
	configureV2 := func(cfg *Config) {
		cfg.DevMode = false
		cfg.DataDir = dataDir
		cfg.RaftConfig.ProtocolVersion = 2
	}
	_, stopV2, err := TestServerErr(t, configureV2)
	if stopV2 != nil {
		// Only reached if the server unexpectedly started; make sure it is
		// still cleaned up.
		defer stopV2()
	}

	// Downgrading Raft should prevent the server from starting.
	require.Error(t, err)
}
13 changes: 10 additions & 3 deletions nomad/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/version"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
)

var (
Expand All @@ -39,6 +40,12 @@ func TestACLServer(t *testing.T, cb func(*Config)) (*Server, *structs.ACLToken,
}

// TestServer starts a test server through TestServerErr and fails the test
// immediately if that call returns an error. On success it returns the server
// together with the cleanup function supplied by TestServerErr.
func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) {
	server, cleanup, err := TestServerErr(t, cb)
	require.NoError(t, err, "failed to start test server")
	return server, cleanup
}

func TestServerErr(t *testing.T, cb func(*Config)) (*Server, func(), error) {
// Setup the default settings
config := DefaultConfig()

Expand Down Expand Up @@ -137,10 +144,10 @@ func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) {
case <-time.After(1 * time.Minute):
t.Fatal("timed out while shutting down server")
}
}
}, nil
} else if i == 0 {
freeport.Return(ports)
t.Fatalf("err: %v", err)
return nil, nil, err
} else {
if server != nil {
_ = server.Shutdown()
Expand All @@ -151,7 +158,7 @@ func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) {
}
}

return nil, nil
return nil, nil, nil
}

func TestJoin(t *testing.T, servers ...*Server) {
Expand Down

0 comments on commit 0783ac6

Please sign in to comment.