Skip to content

Commit

Permalink
VAULT-14048: raft-autopilot appears to refuse to remove a node which …
Browse files Browse the repository at this point in the history
…has left and wouldn't impact stability (hashicorp#19472)

* ensure we supply the node type when it's for a voter
* bumped autopilot version back to v0.2.0 and ran go mod tidy
* changed condition in knownservers and added some comments
* Export GetRaftBackend
* Updated tests for autopilot (related to dead server cleanup)
* Export Raft NewDelegate

Co-authored-by: Nick Cabatoff <ncabatoff@hashicorp.com>
  • Loading branch information
Peter Wilson and ncabatoff authored Apr 3, 2023
1 parent 1cef47d commit 2054ffc
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 59 deletions.
3 changes: 3 additions & 0 deletions changelog/19472.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
autopilot: Update version to v0.2.0 to add better support for respecting min quorum
```
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ require (
github.com/hashicorp/hcp-sdk-go v0.23.0
github.com/hashicorp/nomad/api v0.0.0-20220707195938-75f4c2237b28
github.com/hashicorp/raft v1.3.10
github.com/hashicorp/raft-autopilot v0.1.6
github.com/hashicorp/raft-autopilot v0.2.0
github.com/hashicorp/raft-boltdb/v2 v2.0.0-20210421194847-a7e34179d62c
github.com/hashicorp/raft-snapshot v1.0.4
github.com/hashicorp/vault-plugin-auth-alicloud v0.14.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -939,8 +939,8 @@ github.com/hashicorp/raft v1.1.2-0.20191002163536-9c6bd3e3eb17/go.mod h1:vPAJM8A
github.com/hashicorp/raft v1.2.0/go.mod h1:vPAJM8Asw6u8LxC3eJCUZmRP/E4QmUGE1R7g7k8sG/8=
github.com/hashicorp/raft v1.3.10 h1:LR5QZX1VQd0DFWZfeCwWawyeKfpS/Tm1yjnJIY5X4Tw=
github.com/hashicorp/raft v1.3.10/go.mod h1:J8naEwc6XaaCfts7+28whSeRvCqTd6e20BlCU3LtEO4=
github.com/hashicorp/raft-autopilot v0.1.6 h1:C1q3RNF2FfXNZfHWbvVAu0QixaQK8K5pX4O5lh+9z4I=
github.com/hashicorp/raft-autopilot v0.1.6/go.mod h1:Af4jZBwaNOI+tXfIqIdbcAnh/UyyqIMj/pOISIfhArw=
github.com/hashicorp/raft-autopilot v0.2.0 h1:2/R2RPgamgRKgNWGQioULZvjeKXQZmDuw5Ty+6c+H7Y=
github.com/hashicorp/raft-autopilot v0.2.0/go.mod h1:q6tZ8UAZ5xio2gv2JvjgmtOlh80M6ic8xQYBe2Egkg8=
github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk=
github.com/hashicorp/raft-boltdb/v2 v2.0.0-20210421194847-a7e34179d62c h1:oiKun9QlrOz5yQxMZJ3tf1kWtFYuKSJzxzEDxDPevj4=
github.com/hashicorp/raft-boltdb/v2 v2.0.0-20210421194847-a7e34179d62c/go.mod h1:kiPs9g148eLShc2TYagUAyKDnD+dH9U+CQKsXzlY9xo=
Expand Down
31 changes: 31 additions & 0 deletions helper/testhelpers/testhelpers_oss.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,34 @@ func WaitForActiveNodeAndStandbys(t testing.T, cluster *vault.TestCluster) {
}
}
}

// WaitForNodesExcludingSelectedStandbys is variation on WaitForActiveNodeAndStandbys.
// It waits for the active node before waiting for standby nodes, however
// it will not wait for cores with indexes that match those specified as arguments.
// Whilst you could specify index 0 which is likely to be the leader node, the function
// checks for the leader first regardless of the indexes to skip, so it would be redundant to do so.
// The intention/use case for this function is to allow a cluster to start and become active with one
// or more nodes not joined, so that we can test scenarios where a node joins later.
// e.g. 4 nodes in the cluster, only 3 nodes in cluster 'active', 1 node can be joined later in tests.
func WaitForNodesExcludingSelectedStandbys(t testing.T, cluster *vault.TestCluster, indexesToSkip ...int) {
WaitForActiveNode(t, cluster)

contains := func(elems []int, e int) bool {
for _, v := range elems {
if v == e {
return true
}
}

return false
}
for i, core := range cluster.Cores {
if contains(indexesToSkip, i) {
continue
}

if standby, _ := core.Core.Standby(); standby {
WaitForStandbyNode(t, core)
}
}
}
19 changes: 17 additions & 2 deletions physical/raft/raft_autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ type Delegate struct {
emptyVersionLogs map[raft.ServerID]struct{}
}

func newDelegate(b *RaftBackend) *Delegate {
func NewDelegate(b *RaftBackend) *Delegate {
return &Delegate{
RaftBackend: b,
inflightRemovals: make(map[raft.ServerID]bool),
Expand Down Expand Up @@ -385,6 +385,7 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
return nil
}

apServerStates := d.autopilot.GetState().Servers
servers := future.Configuration().Servers
serverIDs := make([]string, 0, len(servers))
for _, server := range servers {
Expand Down Expand Up @@ -428,6 +429,19 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
Ext: d.autopilotServerExt(state),
}

// As KnownServers is a delegate called by autopilot let's check if we already
// had this data in the correct format and use it. If we don't (which sounds a
// bit sad, unless this ISN'T a voter) then as a fail-safe, let's try what we've
// done elsewhere in code to check the desired suffrage and manually set NodeType
// based on whether that's a voter or not. If we don't do either of these
// things, NodeType isn't set which means technically it's not a voter.
// It shouldn't be a voter and end up in this state.
if apServerState, found := apServerStates[raft.ServerID(id)]; found && apServerState.Server.NodeType != "" {
server.NodeType = apServerState.Server.NodeType
} else if state.DesiredSuffrage == "voter" {
server.NodeType = autopilot.NodeVoter
}

switch state.IsDead.Load() {
case true:
d.logger.Debug("informing autopilot that the node left", "id", id)
Expand All @@ -445,6 +459,7 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
Name: d.localID,
RaftVersion: raft.ProtocolVersionMax,
NodeStatus: autopilot.NodeAlive,
NodeType: autopilot.NodeVoter, // The leader must be a voter
Meta: d.meta(&FollowerState{
UpgradeVersion: d.EffectiveVersion(),
RedundancyZone: d.RedundancyZone(),
Expand Down Expand Up @@ -820,7 +835,7 @@ func (b *RaftBackend) SetupAutopilot(ctx context.Context, storageConfig *Autopil
if b.autopilotUpdateInterval != 0 {
options = append(options, autopilot.WithUpdateInterval(b.autopilotUpdateInterval))
}
b.autopilot = autopilot.New(b.raft, newDelegate(b), options...)
b.autopilot = autopilot.New(b.raft, NewDelegate(b), options...)
b.followerStates = followerStates
b.followerHeartbeatTicker = time.NewTicker(1 * time.Second)

Expand Down
Loading

0 comments on commit 2054ffc

Please sign in to comment.