Skip to content

Commit

Permalink
check if server is in configuration when receiving a voteRequest (#526)
Browse files Browse the repository at this point in the history
* check if server is in configuration, and not have vote rights, when receiving a voteRequest

* change test term to 20 to reduce the chance that the test passes when the bug manifest

Co-authored-by: Nick Cabatoff <ncabatoff@hashicorp.com>

* non voter with higher term vote request would make the node step-down but don't grant a vote

* fix test to check we use the right term

* add more details in the comment and a reference to the PR

Co-authored-by: Nick Cabatoff <ncabatoff@hashicorp.com>
  • Loading branch information
dhiaayachi and ncabatoff authored Oct 3, 2022
1 parent 24e68f8 commit 6b4e320
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 2 deletions.
11 changes: 11 additions & 0 deletions configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,17 @@ func hasVote(configuration Configuration, id ServerID) bool {
return false
}

// inConfiguration returns true if the server identified by 'id' is in in the
// provided Configuration.
func inConfiguration(configuration Configuration, id ServerID) bool {
for _, server := range configuration.Servers {
if server.ID == id {
return true
}
}
return false
}

// checkConfiguration tests a cluster membership configuration for common
// errors.
func checkConfiguration(configuration Configuration) error {
Expand Down
16 changes: 14 additions & 2 deletions raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -1566,8 +1566,8 @@ func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
candidateID := ServerID(req.ID)
// if the Servers list is empty that mean the cluster is very likely trying to bootstrap,
// Grant the vote
if len(r.configurations.latest.Servers) > 0 && !hasVote(r.configurations.latest, candidateID) {
r.logger.Warn("rejecting vote request since node is not a voter",
if len(r.configurations.latest.Servers) > 0 && !inConfiguration(r.configurations.latest, candidateID) {
r.logger.Warn("rejecting vote request since node is not in configuration",
"from", candidate)
return
}
Expand All @@ -1594,6 +1594,18 @@ func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) {
resp.Term = req.Term
}

// if we get a request for vote from a nonVoter and the request term is higher,
// step down and update term, but reject the vote request
// This could happen when a node, previously voter, is converted to non-voter
// The reason we need to step in is to permit to the cluster to make progress in such a scenario
// More details about that in https://github.com/hashicorp/raft/pull/526
if len(req.ID) > 0 {
candidateID := ServerID(req.ID)
if len(r.configurations.latest.Servers) > 0 && !hasVote(r.configurations.latest, candidateID) {
r.logger.Warn("rejecting vote request since node is not a voter", "from", candidate)
return
}
}
// Check if we have voted yet
lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm)
if err != nil && err.Error() != "not found" {
Expand Down
90 changes: 90 additions & 0 deletions raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2644,6 +2644,96 @@ func TestRaft_VoteNotGranted_WhenNodeNotInCluster(t *testing.T) {
}
}

func TestRaft_ClusterCanRegainStability_WhenNonVoterWithHigherTermJoin(t *testing.T) {
// Make a cluster
c := MakeCluster(3, t, nil)

defer c.Close()

// Get the leader
leader := c.Leader()

// Wait until we have 2 followers
limit := time.Now().Add(c.longstopTimeout)
var followers []*Raft
for time.Now().Before(limit) && len(followers) != 2 {
c.WaitEvent(nil, c.conf.CommitTimeout)
followers = c.GetInState(Follower)
}
if len(followers) != 2 {
t.Fatalf("expected two followers: %v", followers)
}

// Remove a follower
followerRemoved := followers[0]
c.Disconnect(followerRemoved.localAddr)
time.Sleep(c.propagateTimeout)

future := leader.RemoveServer(followerRemoved.localID, 0, 0)
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}

//set that follower term to higher term to faster simulate a partitioning
newTerm := leader.getCurrentTerm() + 20
followerRemoved.setCurrentTerm(newTerm)
//Add the node back as NonVoter
future = leader.AddNonvoter(followerRemoved.localID, followerRemoved.localAddr, 0, 0)
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}

c.FullyConnect()

// Wait a while
time.Sleep(c.propagateTimeout)
// Check the term is now a new term
leader = c.Leader()
currentTerm := leader.getCurrentTerm()
if newTerm > currentTerm {
t.Fatalf("term should have changed,%d < %d", newTerm, currentTerm)
}

// check nonVoter is not elected
if leader.localID == followerRemoved.localID {
t.Fatalf("Should not be leader %s", followerRemoved.localID)
}

//Write some logs to ensure they replicate
for i := 0; i < 100; i++ {
future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
if err := future.Error(); err != nil {
t.Fatalf("[ERR] apply err: %v", err)
}
}
c.WaitForReplication(100)

//Remove the server and add it back as Voter
future = leader.RemoveServer(followerRemoved.localID, 0, 0)
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}
leader.AddVoter(followerRemoved.localID, followerRemoved.localAddr, 0, 0)

// Wait a while
time.Sleep(c.propagateTimeout * 10)

//Write some logs to ensure they replicate
for i := 100; i < 200; i++ {
future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
if err := future.Error(); err != nil {
t.Fatalf("[ERR] apply err: %v", err)
}
}
c.WaitForReplication(200)

// Check leader stable
newLeader := c.Leader()
if newLeader.leaderID != leader.leaderID {
t.Fatalf("leader changed")
}
}

// TestRaft_FollowerRemovalNoElection ensures that a leader election is not
// started when a standby is shut down and restarted.
func TestRaft_FollowerRemovalNoElection(t *testing.T) {
Expand Down

0 comments on commit 6b4e320

Please sign in to comment.