Merge pull request #4349 from hashicorp/b-reconcile-raft-upgrade
Remove an unnecessary check in nomad member reconciliation
preetapan committed May 30, 2018
2 parents b1e7eda + 024354f commit 93e7dc8
Showing 4 changed files with 143 additions and 18 deletions.
25 changes: 8 additions & 17 deletions nomad/leader.go
@@ -727,11 +727,6 @@ func (s *Server) reconcileMember(member serf.Member) error {
}
defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())

// Do not reconcile ourself
if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
return nil
}

var err error
switch member.Status {
case serf.StatusAlive:
@@ -768,12 +763,6 @@ func (s *Server) reconcileJobSummaries() error {

// addRaftPeer is used to add a new Raft peer when a Nomad server joins
func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
// Do not join ourselves
if m.Name == s.config.NodeName {
s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
return nil
}

// Check for possibility of multiple bootstrap nodes
members := s.serf.Members()
if parts.Bootstrap {
@@ -786,17 +775,19 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
}
}

// See if it's already in the configuration. It's harmless to re-add it
// but we want to avoid doing that if possible to prevent useless Raft
// log entries.
// Processing ourselves could result in trying to remove ourselves to
// fix up our address, which would make us step down. This is only
// safe to attempt if there are multiple servers available.
addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
configFuture := s.raft.GetConfiguration()
if err := configFuture.Error(); err != nil {
s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
return err
}
for _, server := range configFuture.Configuration().Servers {
if server.Address == raft.ServerAddress(addr) {

if m.Name == s.config.NodeName {
if l := len(configFuture.Configuration().Servers); l < 3 {
s.logger.Printf("[DEBUG] consul: Skipping self join check for %q since the cluster is too small", m.Name)
return nil
}
}
@@ -817,7 +808,7 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {

// If the address or ID matches an existing server, see if we need to remove the old one first
if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
// Exit with no-op if this is being called on an existing server
// Exit with no-op if this is being called on an existing server and both the ID and address match
if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
return nil
}
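For orientation, here is a hedged sketch of how the reworked peer handling in `addRaftPeer` fits together once the hunks above are applied. It is assembled from the diff rather than copied from the Nomad source and is flattened into a free function for readability; the function name, its parameter list, and any glue between hunks are assumptions.

```go
package peers

import (
	"fmt"
	"net"

	"github.com/hashicorp/raft"
)

// maybeAddPeer is a sketch (not the verbatim Nomad code) of the logic in the
// hunks above: fetch the Raft configuration, skip the self-join check when the
// cluster is too small to survive reconfiguring ourselves, drop any stale
// entry that matches only by address or only by ID, then add the server.
func maybeAddPeer(r *raft.Raft, selfName, memberName, id string, ip net.IP, port int) error {
	addr := (&net.TCPAddr{IP: ip, Port: port}).String()

	configFuture := r.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		return fmt.Errorf("failed to get raft configuration: %v", err)
	}
	servers := configFuture.Configuration().Servers

	// Processing ourselves could mean removing ourselves to fix up our
	// address, which would force a step-down. Only attempt that when there
	// are enough servers for the cluster to stay available.
	if memberName == selfName && len(servers) < 3 {
		return nil
	}

	for _, server := range servers {
		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(id) {
			// Both ID and address already match: nothing to do.
			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(id) {
				return nil
			}
			// Otherwise a stale entry exists (an old address, or an
			// address-style ID left over from Raft protocol 2);
			// remove it before re-adding the server below.
			if err := r.RemoveServer(server.ID, 0, 0).Error(); err != nil {
				return fmt.Errorf("failed to remove stale peer %q: %v", server.Address, err)
			}
		}
	}

	// Add (or re-add) the server, identified by its ID.
	return r.AddVoter(raft.ServerID(id), raft.ServerAddress(addr), 0, 0).Error()
}
```

Compared to the pre-change code, the explicit `m.Name == s.config.NodeName` early return is gone; the cluster-size check above is what keeps a lone server from removing and destabilizing itself.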
87 changes: 87 additions & 0 deletions nomad/leader_test.go
@@ -3,6 +3,7 @@ package nomad
import (
"errors"
"fmt"
"strconv"
"testing"
"time"

@@ -13,6 +14,7 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -1112,3 +1114,88 @@ func TestLeader_RevokeLeadership_MultipleTimes(t *testing.T) {
require.Nil(t, s1.revokeLeadership())
require.Nil(t, s1.revokeLeadership())
}

// Test doing an in-place upgrade of a server from Raft protocol 2 to 3.
// This verifies that removing the server and adding it back with a UUID works
// even if the server's address stays the same.
func TestServer_ReconcileMember(t *testing.T) {
// Create a three node cluster
t.Parallel()
s1 := TestServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 3
})
defer s1.Shutdown()

s2 := TestServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 3
})
defer s2.Shutdown()

s3 := TestServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 2
})
defer s3.Shutdown()
TestJoin(t, s1, s2, s3)
testutil.WaitForLeader(t, s1.RPC)

// Create a serf.Member for s3 with its Raft protocol tag upgraded to 3
upgradedS3Member := serf.Member{
Name: s3.config.NodeName,
Addr: s3.config.RPCAddr.IP,
Status: serf.StatusAlive,
Tags: make(map[string]string),
}
upgradedS3Member.Tags["role"] = "nomad"
upgradedS3Member.Tags["id"] = s3.config.NodeID
upgradedS3Member.Tags["region"] = s3.config.Region
upgradedS3Member.Tags["dc"] = s3.config.Datacenter
upgradedS3Member.Tags["rpc_addr"] = "127.0.0.1"
upgradedS3Member.Tags["port"] = strconv.Itoa(s3.config.RPCAddr.Port)
upgradedS3Member.Tags["build"] = "0.8.0"
upgradedS3Member.Tags["vsn"] = "2"
upgradedS3Member.Tags["mvn"] = "1"
upgradedS3Member.Tags["raft_vsn"] = "3"

// Find the leader so that we can call reconcile member on it
var leader *Server
for _, s := range []*Server{s1, s2, s3} {
if s.IsLeader() {
leader = s
}
}
leader.reconcileMember(upgradedS3Member)
// This should remove s3 from the config and potentially cause a leader election
testutil.WaitForLeader(t, s1.RPC)

// Figure out the new leader and call reconcile again, this should add s3 with the new ID format
for _, s := range []*Server{s1, s2, s3} {
if s.IsLeader() {
leader = s
}
}
leader.reconcileMember(upgradedS3Member)
testutil.WaitForLeader(t, s1.RPC)
future := s2.raft.GetConfiguration()
if err := future.Error(); err != nil {
t.Fatal(err)
}
addrs := 0
ids := 0
for _, server := range future.Configuration().Servers {
if string(server.ID) == string(server.Address) {
addrs++
} else {
ids++
}
}
// After this, all three servers should have IDs in raft
if got, want := addrs, 0; got != want {
t.Fatalf("got %d server addresses want %d", got, want)
}
if got, want := ids, 3; got != want {
t.Fatalf("got %d server ids want %d", got, want)
}
}
13 changes: 12 additions & 1 deletion website/source/docs/upgrade/upgrade-specific.html.md
@@ -29,7 +29,7 @@ to match the default.

The Raft protocol must be stepped up in this way; only adjacent version numbers are
compatible (for example, version 1 cannot talk to version 3). Here is a table of the
Raft Protocol versions supported by each Consul version:
Raft Protocol versions supported by each Nomad version:

<table class="table table-bordered table-striped">
<tr>
@@ -53,6 +53,17 @@ Raft Protocol versions supported by each Consul version:
In order to enable all [Autopilot](/guides/cluster/autopilot.html) features, all servers
in a Nomad cluster must be running with Raft protocol version 3 or later.

#### Upgrading to Raft Protocol 3

This section provides details on upgrading to Raft protocol version 3 in Nomad 0.8 and higher. Raft protocol version 3 requires all servers in the cluster to be running Nomad 0.8.0 or newer. See [Raft Protocol Version Compatibility](/docs/upgrade/upgrade-specific.html#raft-protocol-version-compatibility) for more details. The format of `peers.json` used for outage recovery is also different when running with the latest Raft protocol; see [Manual Recovery Using peers.json](/guides/outage.html#manual-recovery-using-peers-json) for a description of the required format.

Please note that the Raft protocol is different from Nomad's internal protocol as shown in commands like `nomad server members`. To see the version of the Raft protocol in use on each server, use the `nomad operator raft list-peers` command.

The easiest way to upgrade servers is to have each server leave the cluster, upgrade its `raft_protocol` version in the `server` stanza, and then add it back. Make sure the new server joins successfully and that the cluster is stable before rolling the upgrade forward to the next server. It's also possible to stand up a new set of servers, and then slowly stand down each of the older servers in a similar fashion.

When using Raft protocol version 3, servers are identified by their `node-id` instead of their IP address when Nomad makes changes to its internal Raft quorum configuration. This means that once a cluster has been upgraded with all servers running Raft protocol version 3, it will no longer allow servers running older Raft protocol versions to be added. If you are running a single Nomad server, restarting it in-place will leave that server unable to elect itself as a leader. To avoid this, either set the Raft protocol back to 2, or use [Manual Recovery Using peers.json](/guides/outage.html#manual-recovery-using-peers-json) to map the server to its node ID in the Raft quorum configuration.
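The new test in `leader_test.go` above distinguishes the two identification schemes by checking whether a Raft server's ID equals its address. A minimal sketch of that check, assuming access to a `*raft.Raft` handle (the function name is illustrative, not part of Nomad's API):

```go
package peers

import "github.com/hashicorp/raft"

// countIDStyles reports how many entries in the Raft configuration are still
// identified by their address (Raft protocol 2 style, where the ID equals the
// address) versus by a node ID (Raft protocol 3 style).
func countIDStyles(r *raft.Raft) (byAddress, byID int, err error) {
	future := r.GetConfiguration()
	if err := future.Error(); err != nil {
		return 0, 0, err
	}
	for _, server := range future.Configuration().Servers {
		if string(server.ID) == string(server.Address) {
			byAddress++
		} else {
			byID++
		}
	}
	return byAddress, byID, nil
}
```

Once every server reports a node ID here, the cluster is fully on the protocol 3 identification scheme and address-only peers can no longer be added.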


### Node Draining Improvements

Node draining via the [`node drain`][drain-cli] command or the [drain
36 changes: 36 additions & 0 deletions website/source/guides/outage.html.markdown
@@ -186,3 +186,39 @@ nomad-server01.global 10.10.11.5:4647 10.10.11.5:4647 follower true
nomad-server02.global 10.10.11.6:4647 10.10.11.6:4647 leader true
nomad-server03.global 10.10.11.7:4647 10.10.11.7:4647 follower true
```

## Peers.json Format Changes in Raft Protocol 3

For Raft protocol version 3 and later, peers.json should be formatted as a JSON
array containing the node ID, address:port, and suffrage information of each
Nomad server in the cluster, like this:

```
[
{
"id": "adf4238a-882b-9ddc-4a9d-5b6758e4159e",
"address": "10.1.0.1:8300",
"non_voter": false
},
{
"id": "8b6dda82-3103-11e7-93ae-92361f002671",
"address": "10.1.0.2:8300",
"non_voter": false
},
{
"id": "97e17742-3103-11e7-93ae-92361f002671",
"address": "10.1.0.3:8300",
"non_voter": false
}
]
```

- `id` `(string: <required>)` - Specifies the `node ID`
of the server. This can be found in the logs when the server starts up,
and it can also be found inside the `node-id` file in the server's data directory.

- `address` `(string: <required>)` - Specifies the IP and port of the server. The port is the
server's RPC port used for cluster communications.

- `non_voter` `(bool: <false>)` - This controls whether the server is a non-voter, which is used
in some advanced [Autopilot](/guides/cluster/autopilot.html) configurations. If omitted, it will
default to false, which is typical for most clusters.
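
For scripting around outage recovery, the format above maps onto a small struct. A hedged sketch follows, in which only the JSON field names come from the documented format; the type and function names are illustrative:

```go
package peers

import (
	"encoding/json"
	"os"
)

// peerEntry mirrors one element of the Raft protocol 3 peers.json array
// described above.
type peerEntry struct {
	ID       string `json:"id"`
	Address  string `json:"address"`   // IP:port of the server's RPC address
	NonVoter bool   `json:"non_voter"` // defaults to false when omitted
}

// loadPeers reads and parses a peers.json file in the protocol 3 format.
func loadPeers(path string) ([]peerEntry, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var peers []peerEntry
	if err := json.Unmarshal(raw, &peers); err != nil {
		return nil, err
	}
	return peers, nil
}
```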
