Skip to content

Commit

Permalink
autopilot: include only servers from the same region
Browse files Browse the repository at this point in the history
When we migrated to the updated autopilot library in Nomad 1.4.0, the interface
for finding servers changed. Previously autopilot would get the serf members and
call `IsServer` on each of them, leaving it up to the implementor to filter out
clients (and in Nomad's case, other regions). But in the "new" autopilot
library, the equivalent interface is `KnownServers` for which we did not filter
by region. This causes spurious attempts for the cross-region stats fetching,
which results in TLS errors and a lot of log noise.

Filter the member set by region to fix the regression.
  • Loading branch information
tgross committed Nov 17, 2022
1 parent 732adae commit 662f6db
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .changelog/15290.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
autopilot: Fixed a bug where autopilot would try to fetch raft stats from other regions
```
5 changes: 4 additions & 1 deletion nomad/autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server {
s.logger.Warn("Error parsing server info", "name", member.Name, "error", err)
continue
} else if srv == nil {
// this member was a client
// this member was a client or in another region
continue
}

Expand All @@ -210,6 +210,9 @@ func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) {
if !ok {
return nil, nil
}
if srv.Region != s.Region() {
return nil, nil
}

return s.autopilotServerFromMetadata(srv)
}
Expand Down
36 changes: 36 additions & 0 deletions nomad/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,42 @@ func TestAutopilot_RollingUpdate(t *testing.T) {
waitForStableLeadership(t, servers)
}

func TestAutopilot_MultiRegion(t *testing.T) {
ci.Parallel(t)

conf := func(c *Config) {
c.NumSchedulers = 0 // reduces test log noise
c.BootstrapExpect = 3
}
s1, cleanupS1 := TestServer(t, conf)
defer cleanupS1()

s2, cleanupS2 := TestServer(t, conf)
defer cleanupS2()

s3, cleanupS3 := TestServer(t, conf)
defer cleanupS3()

// federated regions should not be considered raft peers or show up in the
// known servers list
s4, cleanupS4 := TestServer(t, func(c *Config) {
c.BootstrapExpect = 0
c.Region = "other"
})
defer cleanupS4()

servers := []*Server{s1, s2, s3}
TestJoin(t, s1, s2, s3, s4)

t.Logf("waiting for initial stable cluster")
waitForStableLeadership(t, servers)

apDelegate := &AutopilotDelegate{s3}
known := apDelegate.KnownServers()
must.Eq(t, 3, len(known))

}

func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
ci.Parallel(t)

Expand Down

0 comments on commit 662f6db

Please sign in to comment.