From ddf636eb114331ac2c64c178b43b1acb61b7a1e5 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Tue, 22 Mar 2022 18:59:28 -0400 Subject: [PATCH 1/3] core: store and check for Raft version changes Downgrading the Raft protocol version is not a supported operation. Checking for a downgrade is hard since this information is not stored in any persistent place. When a server re-joins a cluster with a prior Raft version, the Serf tag is updated so Nomad can't tell that the version changed. Mixed version clusters must be supported to allow for zero-downtime rolling upgrades. During this it's expected that the cluster will have mixed Raft versions. Enforcing strong version consistency would disrupt this flow. The approach taken here is to store the Raft version on disk. When the server starts, the `raft_protocol` value is written to the file `data_dir/raft/version`. If that file already exists, its content is checked against the current `raft_protocol` value to detect downgrades and prevent the server from starting. Any other types of errors are ignored to prevent disruptions that are outside the control of operators. The only option in cases of an invalid or corrupt file would be to delete it, making this check useless. So just overwrite its content with the new version and provide guidance on how to check that their cluster is in an expected state. --- nomad/server.go | 46 ++++++++++++++++++++++++++++++++++++++++++++ nomad/server_test.go | 26 +++++++++++++++++++++++++ nomad/testing.go | 14 +++++++++++--- 3 files changed, 83 insertions(+), 3 deletions(-) diff --git a/nomad/server.go b/nomad/server.go index 5e3d2eb51ac2..03b2a4e738e8 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1284,6 +1284,16 @@ func (s *Server) setupRaft() error { return err } + // Check Raft version and update the version file. 
+ raftVersionFilePath := filepath.Join(path, "version") + raftVersionFileContent := fmt.Sprintf("%d", s.config.RaftConfig.ProtocolVersion) + if err := s.checkRaftVersionFile(raftVersionFilePath); err != nil { + return err + } + if err := ioutil.WriteFile(raftVersionFilePath, []byte(raftVersionFileContent), 0755); err != nil { + return fmt.Errorf("failed to write Raft version file: %v", err) + } + // Create the BoltDB backend, with NoFreelistSync option store, raftErr := raftboltdb.New(raftboltdb.Options{ Path: filepath.Join(path, "raft.db"), @@ -1399,6 +1409,42 @@ func (s *Server) setupRaft() error { return nil } +// checkRaftVersionFile reads the Raft version file and returns an error if +// the Raft version is incompatible with the current version configured. +// Provide best-effort check if the file cannot be read. +func (s *Server) checkRaftVersionFile(path string) error { + raftVersion := s.config.RaftConfig.ProtocolVersion + baseWarning := "use the 'nomad operator raft list-peers' command to make sure the Raft protocol versions are consistent" + + _, err := os.Stat(path) + if err != nil { + if os.IsNotExist(err) { + s.logger.Info(fmt.Sprintf("Raft version file not found, %v", baseWarning)) + } else { + s.logger.Warn(fmt.Sprintf("unable to read Raft version file, %s", baseWarning), "error", err) + } + return nil + } + + v, err := ioutil.ReadFile(path) + if err != nil { + s.logger.Warn(fmt.Sprintf("unable to read Raft version file, %s", baseWarning), "error", err) + return nil + } + + previousVersion, err := strconv.Atoi(string(v)) + if err != nil { + s.logger.Warn(fmt.Sprintf("invalid Raft protocol version in Raft version file, %s", baseWarning), "error", err) + return nil + } + + if raft.ProtocolVersion(previousVersion) > raftVersion { + return fmt.Errorf("downgrading Raft is not supported, current version is %d, previous version was %d", raftVersion, previousVersion) + } + + return nil +} + // setupSerf is used to setup and initialize a Serf func (s *Server) 
setupSerf(conf *serf.Config, ch chan serf.Event, path string) (*serf.Serf, error) { conf.Init() diff --git a/nomad/server_test.go b/nomad/server_test.go index db1b1091e22d..68ead1c0fc07 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -645,3 +645,29 @@ func TestServer_ReloadSchedulers_InvalidSchedulers(t *testing.T) { currentWC = s.GetSchedulerWorkerConfig() require.Equal(t, origWC, currentWC) } + +func TestServer_PreventRaftDowngrade(t *testing.T) { + ci.Parallel(t) + + dir := tmpDir(t) + defer os.RemoveAll(dir) + + _, cleanupv3 := TestServer(t, func(c *Config) { + c.DevMode = false + c.DataDir = dir + c.RaftConfig.ProtocolVersion = 3 + }) + cleanupv3() + + _, cleanupv2, err := TestServerWithErr(t, func(c *Config) { + c.DevMode = false + c.DataDir = dir + c.RaftConfig.ProtocolVersion = 2 + }) + if cleanupv2 != nil { + defer cleanupv2() + } + + // Downgrading Raft should prevent the server from starting. + require.Error(t, err) +} diff --git a/nomad/testing.go b/nomad/testing.go index 9fbe2ca02e41..7078359607f8 100644 --- a/nomad/testing.go +++ b/nomad/testing.go @@ -39,6 +39,14 @@ func TestACLServer(t *testing.T, cb func(*Config)) (*Server, *structs.ACLToken, } func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) { + s, c, err := TestServerWithErr(t, cb) + if err != nil { + t.Fatalf("err: %v", err) + } + return s, c +} + +func TestServerWithErr(t *testing.T, cb func(*Config)) (*Server, func(), error) { // Setup the default settings config := DefaultConfig() @@ -137,10 +145,10 @@ func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) { case <-time.After(1 * time.Minute): t.Fatal("timed out while shutting down server") } - } + }, nil } else if i == 0 { freeport.Return(ports) - t.Fatalf("err: %v", err) + return nil, nil, err } else { if server != nil { _ = server.Shutdown() @@ -151,7 +159,7 @@ func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) { } } - return nil, nil + return nil, nil, nil } func TestJoin(t 
*testing.T, servers ...*Server) { From 67c112c5234315389dab694e0ccdfc57fcde6c2b Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Wed, 23 Mar 2022 15:18:23 -0400 Subject: [PATCH 2/3] changelog: add entry for #12362 --- .changelog/12362.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/12362.txt diff --git a/.changelog/12362.txt b/.changelog/12362.txt new file mode 100644 index 000000000000..7a8dcca5e585 --- /dev/null +++ b/.changelog/12362.txt @@ -0,0 +1,3 @@ +```release-note:improvement +server: store and check previous Raft protocol version to prevent downgrades +``` From 6cafda26fedd2f80dbd02261aa3567ecf9cf2ee3 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Wed, 23 Mar 2022 19:00:50 -0400 Subject: [PATCH 3/3] address some code review comments --- nomad/server.go | 12 ++++++------ nomad/server_test.go | 6 ++---- nomad/testing.go | 9 ++++----- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/nomad/server.go b/nomad/server.go index 03b2a4e738e8..30647a72f3de 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1286,11 +1286,11 @@ func (s *Server) setupRaft() error { // Check Raft version and update the version file. 
raftVersionFilePath := filepath.Join(path, "version") - raftVersionFileContent := fmt.Sprintf("%d", s.config.RaftConfig.ProtocolVersion) + raftVersionFileContent := strconv.Itoa(int(s.config.RaftConfig.ProtocolVersion)) if err := s.checkRaftVersionFile(raftVersionFilePath); err != nil { return err } - if err := ioutil.WriteFile(raftVersionFilePath, []byte(raftVersionFileContent), 0755); err != nil { + if err := ioutil.WriteFile(raftVersionFilePath, []byte(raftVersionFileContent), 0644); err != nil { return fmt.Errorf("failed to write Raft version file: %v", err) } @@ -1419,10 +1419,10 @@ func (s *Server) checkRaftVersionFile(path string) error { _, err := os.Stat(path) if err != nil { if os.IsNotExist(err) { - s.logger.Info(fmt.Sprintf("Raft version file not found, %v", baseWarning)) - } else { - s.logger.Warn(fmt.Sprintf("unable to read Raft version file, %s", baseWarning), "error", err) + return nil } + + s.logger.Warn(fmt.Sprintf("unable to read Raft version file, %s", baseWarning), "error", err) return nil } @@ -1432,7 +1432,7 @@ func (s *Server) checkRaftVersionFile(path string) error { return nil } - previousVersion, err := strconv.Atoi(string(v)) + previousVersion, err := strconv.Atoi(strings.TrimSpace(string(v))) if err != nil { s.logger.Warn(fmt.Sprintf("invalid Raft protocol version in Raft version file, %s", baseWarning), "error", err) return nil diff --git a/nomad/server_test.go b/nomad/server_test.go index 68ead1c0fc07..858ac0dd715f 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -649,9 +649,7 @@ func TestServer_ReloadSchedulers_InvalidSchedulers(t *testing.T) { func TestServer_PreventRaftDowngrade(t *testing.T) { ci.Parallel(t) - dir := tmpDir(t) - defer os.RemoveAll(dir) - + dir := t.TempDir() _, cleanupv3 := TestServer(t, func(c *Config) { c.DevMode = false c.DataDir = dir @@ -659,7 +657,7 @@ func TestServer_PreventRaftDowngrade(t *testing.T) { }) cleanupv3() - _, cleanupv2, err := TestServerWithErr(t, func(c *Config) { + _, 
cleanupv2, err := TestServerErr(t, func(c *Config) { c.DevMode = false c.DataDir = dir c.RaftConfig.ProtocolVersion = 2 diff --git a/nomad/testing.go b/nomad/testing.go index 7078359607f8..3dd3d5ba2c71 100644 --- a/nomad/testing.go +++ b/nomad/testing.go @@ -17,6 +17,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/version" "github.com/pkg/errors" + "github.com/stretchr/testify/require" ) var ( @@ -39,14 +40,12 @@ func TestACLServer(t *testing.T, cb func(*Config)) (*Server, *structs.ACLToken, } func TestServer(t *testing.T, cb func(*Config)) (*Server, func()) { - s, c, err := TestServerWithErr(t, cb) - if err != nil { - t.Fatalf("err: %v", err) - } + s, c, err := TestServerErr(t, cb) + require.NoError(t, err, "failed to start test server") return s, c } -func TestServerWithErr(t *testing.T, cb func(*Config)) (*Server, func(), error) { +func TestServerErr(t *testing.T, cb func(*Config)) (*Server, func(), error) { // Setup the default settings config := DefaultConfig()