Fix dead server removal condition to use correct failure tolerance #4017

Merged
6 commits, merged Dec 16, 2019
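In summary, the diff below replaces the inline dead-server removal condition in pruneDeadServers with a canRemoveServers helper. The old code required removalCount < peers/2, which is off by one with respect to Raft failure tolerance: a cluster of n = 2F+1 voters tolerates F = (n-1)/2 failures, so removing exactly (n-1)/2 dead servers is still safe. The standalone Go sketch below is not part of the PR; oldCanRemove, newCanRemove, and the sample numbers are illustrative only. It contrasts the two conditions for a 5-server cluster with a minimum quorum of 3, the boundary case the old check rejected.

package main

import "fmt"

// oldCanRemove mirrors the pre-change condition in pruneDeadServers:
// allow removal only if quorum is kept and fewer than peers/2 servers are dead.
func oldCanRemove(peers, minQuorum, dead int) bool {
	return peers-dead >= minQuorum && dead < peers/2
}

// newCanRemove mirrors the canRemoveServers helper added by this PR:
// keep at least minQuorum servers and never remove more than (peers-1)/2.
func newCanRemove(peers, minQuorum, dead int) bool {
	if peers-dead < minQuorum {
		return false
	}
	return dead <= (peers-1)/2
}

func main() {
	// Five voters tolerate (5-1)/2 = 2 failures; the old check denies this case.
	peers, minQuorum, dead := 5, 3, 2
	fmt.Println("old condition allows removal:", oldCanRemove(peers, minQuorum, dead)) // false
	fmt.Println("new condition allows removal:", newCanRemove(peers, minQuorum, dead)) // true
}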
Changes from all commits
70 changes: 42 additions & 28 deletions agent/consul/autopilot/autopilot.go
@@ -174,6 +174,20 @@ func (a *Autopilot) RemoveDeadServers() {
 	}
 }
 
+func canRemoveServers(peers, minQuorum, deadServers int) (bool, string) {
+	if peers-deadServers < int(minQuorum) {
+		return false, fmt.Sprintf("denied, because removing %d/%d servers would leave less than minimal allowed quorum of %d servers", deadServers, peers, minQuorum)
+	}
+
+	// Only do removals if a minority of servers will be affected.
+	// For failure tolerance of F we need n = 2F+1 servers.
+	// This means we can safely remove up to (n-1)/2 servers.
+	if deadServers > (peers-1)/2 {
+		return false, fmt.Sprintf("denied, because removing the majority of servers %d/%d is not safe", deadServers, peers)
+	}
+	return true, fmt.Sprintf("allowed, because removing %d/%d servers leaves a majority of servers above the minimal allowed quorum %d", deadServers, peers, minQuorum)
+}
+
 // pruneDeadServers removes up to numPeers/2 failed servers
 func (a *Autopilot) pruneDeadServers() error {
 	conf := a.delegate.AutopilotConfig()
@@ -226,42 +240,42 @@ func (a *Autopilot) pruneDeadServers() error {
 		}
 	}
 
-	// We can bail early if there's nothing to do.
-	removalCount := len(failed) + len(staleRaftServers)
-	if removalCount == 0 {
+	deadServers := len(failed) + len(staleRaftServers)
+
+	// nothing to do
+	if deadServers == 0 {
 		return nil
 	}
 
-	// Only do removals if a minority of servers will be affected.
-	peers := NumPeers(raftConfig)
-	if peers-removalCount >= int(conf.MinQuorum) && removalCount < peers/2 {
-		for _, node := range failed {
-			a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
-			go serfLAN.RemoveFailedNode(node.Name)
-			if serfWAN != nil {
-				go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
-			}
-		}
+	if ok, msg := canRemoveServers(NumPeers(raftConfig), int(conf.MinQuorum), deadServers); !ok {
+		a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: %s.", msg)
+		return nil
+	}
 
-		minRaftProtocol, err := a.MinRaftProtocol()
-		if err != nil {
-			return err
-		}
-		for _, raftServer := range staleRaftServers {
-			a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
-			var future raft.Future
-			if minRaftProtocol >= 2 {
-				future = raftNode.RemoveServer(raftServer.ID, 0, 0)
-			} else {
-				future = raftNode.RemovePeer(raftServer.Address)
-			}
-			if err := future.Error(); err != nil {
-				return err
-			}
+	for _, node := range failed {
+		a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
+		go serfLAN.RemoveFailedNode(node.Name)
+		if serfWAN != nil {
+			go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
 		}
-	} else {
-		a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
+	}
+
+	minRaftProtocol, err := a.MinRaftProtocol()
+	if err != nil {
+		return err
+	}
+
+	for _, raftServer := range staleRaftServers {
+		a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
+		var future raft.Future
+		if minRaftProtocol >= 2 {
+			future = raftNode.RemoveServer(raftServer.ID, 0, 0)
+		} else {
+			future = raftNode.RemovePeer(raftServer.Address)
+		}
+		if err := future.Error(); err != nil {
+			return err
+		}
 	}
 
 	return nil
25 changes: 25 additions & 0 deletions agent/consul/autopilot/autopilot_test.go
@@ -6,6 +6,7 @@ import (
 	"testing"
 
 	"github.com/hashicorp/serf/serf"
+	"github.com/stretchr/testify/require"
 )
 
 func TestMinRaftProtocol(t *testing.T) {
@@ -84,3 +85,27 @@ func TestMinRaftProtocol(t *testing.T) {
 		}
 	}
 }
+
+func TestAutopilot_canRemoveServers(t *testing.T) {
+	type test struct {
+		peers       int
+		minQuorum   int
+		deadServers int
+		ok          bool
+	}
+
+	tests := []test{
+		{1, 1, 1, false},
+		{3, 3, 1, false},
+		{4, 3, 3, false},
+		{5, 3, 3, false},
+		{5, 3, 2, true},
+		{5, 3, 1, true},
+		{9, 3, 5, false},
+	}
+	for _, test := range tests {
+		ok, msg := canRemoveServers(test.peers, test.minQuorum, test.deadServers)
+		require.Equal(t, test.ok, ok)
+		t.Logf("%+v: %s", test, msg)
+	}
+}
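Reading the table against the two checks in canRemoveServers: the {4, 3, 3} and {9, 3, 5} cases are denied for different reasons. Removing 3 of 4 servers would leave only 1 server, below the minimum quorum of 3, so the quorum check fires; removing 5 of 9 would still leave 4 >= 3, but 5 exceeds the failure tolerance of (9-1)/2 = 4, so the majority check fires. The {5, 3, 2} case passes both checks, since 5-2 = 3 >= 3 and 2 <= (5-1)/2 = 2.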
62 changes: 44 additions & 18 deletions agent/consul/autopilot_test.go
@@ -9,6 +9,7 @@ import (
 	"github.com/hashicorp/consul/testrpc"
 	"github.com/hashicorp/raft"
 	"github.com/hashicorp/serf/serf"
+	"github.com/stretchr/testify/require"
 )
 
 func TestAutopilot_IdempotentShutdown(t *testing.T) {
@@ -37,7 +38,7 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
 	conf := func(c *Config) {
 		c.Datacenter = dc
 		c.Bootstrap = false
-		c.BootstrapExpect = 3
+		c.BootstrapExpect = 5
 		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
 	}
 	dir1, s1 := testServerWithConfig(t, conf)
@@ -52,43 +53,68 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
 	defer os.RemoveAll(dir3)
 	defer s3.Shutdown()
 
-	servers := []*Server{s1, s2, s3}
+	dir4, s4 := testServerWithConfig(t, conf)
+	defer os.RemoveAll(dir4)
+	defer s4.Shutdown()
+
+	dir5, s5 := testServerWithConfig(t, conf)
+	defer os.RemoveAll(dir5)
+	defer s5.Shutdown()
+
+	servers := []*Server{s1, s2, s3, s4, s5}
 
 	// Try to join
 	joinLAN(t, s2, s1)
 	joinLAN(t, s3, s1)
+	joinLAN(t, s4, s1)
+	joinLAN(t, s5, s1)
 
 	for _, s := range servers {
 		testrpc.WaitForLeader(t, s.RPC, dc)
-		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) })
 	}
 
-	// Bring up a new server
-	dir4, s4 := testServerWithConfig(t, conf)
-	defer os.RemoveAll(dir4)
-	defer s4.Shutdown()
+	require := require.New(t)
+	testrpc.WaitForLeader(t, s1.RPC, "dc1")
+	leaderIndex := -1
+	for i, s := range servers {
+		if s.IsLeader() {
+			leaderIndex = i
+			break
+		}
+	}
+	require.NotEqual(leaderIndex, -1)
 
-	// Kill a non-leader server
-	s3.Shutdown()
+	// Shutdown two non-leader servers
+	killed := make(map[string]struct{})
+	for i, s := range servers {
+		if i != leaderIndex {
+			s.Shutdown()
+			killed[string(s.config.NodeID)] = struct{}{}
+		}
+		if len(killed) == 2 {
+			break
+		}
+	}
+
 	retry.Run(t, func(r *retry.R) {
 		alive := 0
-		for _, m := range s1.LANMembers() {
+		for _, m := range servers[leaderIndex].LANMembers() {
 			if m.Status == serf.StatusAlive {
 				alive++
 			}
 		}
-		if alive != 2 {
-			r.Fatal(nil)
+		if alive != 3 {
+			r.Fatalf("Expected three alive servers instead of %d", alive)
 		}
 	})
 
-	// Join the new server
-	joinLAN(t, s4, s1)
-	servers[2] = s4
-
-	// Make sure the dead server is removed and we're back to 3 total peers
+	// Make sure the dead servers are removed and we're back to 3 total peers
 	for _, s := range servers {
-		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+		_, killed := killed[string(s.config.NodeID)]
+		if !killed {
+			retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+		}
 	}
 }

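The reworked testCleanupDeadServer appears to target exactly the boundary that canRemoveServers fixes: with five servers, shutting down two non-leaders stays within the failure tolerance of (5-1)/2 = 2, so autopilot is expected to prune both and the surviving members converge back to 3 peers, whereas the old removalCount < peers/2 condition (2 < 2) would have refused the removal.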