
Commit

Make dead server removal condition in autopilot use correct failure tolerance rules

preetapan committed Apr 6, 2018
1 parent 0a4ff40 commit 18e0726
Showing 2 changed files with 46 additions and 18 deletions.
4 changes: 3 additions & 1 deletion agent/consul/autopilot/autopilot.go
@@ -222,7 +222,9 @@ func (a *Autopilot) pruneDeadServers() error {
 
 	// Only do removals if a minority of servers will be affected.
 	peers := NumPeers(raftConfig)
-	if removalCount < peers/2 {
+	// For failure tolerance of F we need n = 2F+1 servers.
+	// This means we can safely remove up to (n-1)/2 servers
+	if removalCount <= (peers-1)/2 {
 		for _, node := range failed {
 			a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node)
 			go serfLAN.RemoveFailedNode(node)
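
For context, here is a minimal standalone sketch (not part of this commit, and not Consul code) of the arithmetic behind the new guard: with n = 2F+1 servers the cluster tolerates F failures, so up to (n-1)/2 dead servers can safely be pruned, while for the usual odd cluster sizes the old strict check (removalCount < peers/2) permitted one fewer removal.

	// Illustrative sketch only: how many failed servers each guard would let
	// autopilot remove for common cluster sizes.
	package main

	import "fmt"

	func main() {
		for _, peers := range []int{3, 5, 7} {
			// Old guard: removalCount < peers/2, i.e. at most peers/2 - 1 removals.
			oldMax := peers/2 - 1
			// New guard: removalCount <= (peers-1)/2, i.e. at most (peers-1)/2 removals,
			// matching the failure tolerance F of an n = 2F+1 cluster.
			newMax := (peers - 1) / 2
			fmt.Printf("peers=%d: old guard allows %d, new guard allows %d\n", peers, oldMax, newMax)
		}
	}

Running it prints 0 vs 1 for three servers, 1 vs 2 for five, and 2 vs 3 for seven, which is the tolerance the new comment in the diff describes.
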
60 changes: 43 additions & 17 deletions agent/consul/autopilot_test.go
@@ -9,6 +9,7 @@ import (
 	"github.com/hashicorp/consul/testutil/retry"
 	"github.com/hashicorp/raft"
 	"github.com/hashicorp/serf/serf"
+	"github.com/stretchr/testify/require"
 )
 
 func TestAutopilot_IdempotentShutdown(t *testing.T) {
@@ -36,7 +37,7 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
 	conf := func(c *Config) {
 		c.Datacenter = "dc1"
 		c.Bootstrap = false
-		c.BootstrapExpect = 3
+		c.BootstrapExpect = 5
 		c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
 	}
 	dir1, s1 := testServerWithConfig(t, conf)
@@ -51,42 +52,67 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
 	defer os.RemoveAll(dir3)
 	defer s3.Shutdown()
 
-	servers := []*Server{s1, s2, s3}
+	dir4, s4 := testServerWithConfig(t, conf)
+	defer os.RemoveAll(dir4)
+	defer s4.Shutdown()
+
+	dir5, s5 := testServerWithConfig(t, conf)
+	defer os.RemoveAll(dir5)
+	defer s5.Shutdown()
+
+	servers := []*Server{s1, s2, s3, s4, s5}
 
 	// Try to join
 	joinLAN(t, s2, s1)
 	joinLAN(t, s3, s1)
+	joinLAN(t, s4, s1)
+	joinLAN(t, s5, s1)
 
 	for _, s := range servers {
-		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) })
 	}
 
-	// Bring up a new server
-	dir4, s4 := testServerWithConfig(t, conf)
-	defer os.RemoveAll(dir4)
-	defer s4.Shutdown()
+	require := require.New(t)
+	testrpc.WaitForLeader(t, s1.RPC, "dc1")
+	leaderIndex := -1
+	for i, s := range servers {
+		if s.IsLeader() {
+			leaderIndex = i
+			break
+		}
+	}
+	require.NotEqual(leaderIndex, -1)
 
+	// Shutdown two non-leader servers
+	killed := make(map[string]struct{})
+	for i, s := range servers {
+		if i != leaderIndex {
+			s.Shutdown()
+			killed[string(s.config.NodeID)] = struct{}{}
+		}
+		if len(killed) == 2 {
+			break
+		}
+	}
+
-	// Kill a non-leader server
-	s3.Shutdown()
 	retry.Run(t, func(r *retry.R) {
 		alive := 0
 		for _, m := range s1.LANMembers() {
 			if m.Status == serf.StatusAlive {
 				alive++
 			}
 		}
-		if alive != 2 {
-			r.Fatal(nil)
+		if alive != 3 {
+			r.Fatal("Expected three alive servers")
 		}
 	})
 
-	// Join the new server
-	joinLAN(t, s4, s1)
-	servers[2] = s4
-
-	// Make sure the dead server is removed and we're back to 3 total peers
+	// Make sure the dead servers are removed and we're back to 3 total peers
 	for _, s := range servers {
-		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+		_, killed := killed[string(s.config.NodeID)]
+		if !killed {
+			retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
+		}
 	}
 }
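
The reworked test lines up with that rule. As a hedged sketch of the arithmetic it relies on (the helper names below are illustrative, not part of the test suite): five servers give a quorum of three, so killing two non-leaders still leaves a live quorum, and two dead servers are within the (peers-1)/2 tolerance that the new guard lets autopilot prune.

	// Illustrative sketch only: why five servers and two killed non-leaders.
	package main

	import "fmt"

	// quorum returns the minimum number of voters that must stay alive.
	func quorum(n int) int { return n/2 + 1 }

	// maxPrunable returns how many dead servers the new guard allows removing.
	func maxPrunable(n int) int { return (n - 1) / 2 }

	func main() {
		const peers = 5  // cluster size used by the updated test
		const killed = 2 // non-leader servers the test shuts down
		fmt.Println("quorum:", quorum(peers))                         // 3
		fmt.Println("alive after kills:", peers-killed)               // 3, so a quorum survives
		fmt.Println("prunable under new guard:", maxPrunable(peers))  // 2, both dead servers can go
	}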

