From 662f6db7ee69aaa062fef97e675a8347520c72c5 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 17 Nov 2022 11:22:13 -0500 Subject: [PATCH] autopilot: include only servers from the same region When we migrated to the updated autopilot library in Nomad 1.4.0, the interface for finding servers changed. Previously autopilot would get the serf members and call `IsServer` on each of them, leaving it up to the implementor to filter out clients (and in Nomad's case, other regions). But in the "new" autopilot library, the equivalent interface is `KnownServers` for which we did not filter by region. This causes spurious attempts for the cross-region stats fetching, which results in TLS errors and a lot of log noise. Filter the member set by region to fix the regression. --- .changelog/15290.txt | 3 +++ nomad/autopilot.go | 5 ++++- nomad/autopilot_test.go | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 .changelog/15290.txt diff --git a/.changelog/15290.txt b/.changelog/15290.txt new file mode 100644 index 000000000000..a7405ffe4b28 --- /dev/null +++ b/.changelog/15290.txt @@ -0,0 +1,3 @@ +```release-note:bug +autopilot: Fixed a bug where autopilot would try to fetch raft stats from other regions +``` diff --git a/nomad/autopilot.go b/nomad/autopilot.go index c5a382ecebb7..52a303e50995 100644 --- a/nomad/autopilot.go +++ b/nomad/autopilot.go @@ -195,7 +195,7 @@ func (s *Server) autopilotServers() map[raft.ServerID]*autopilot.Server { s.logger.Warn("Error parsing server info", "name", member.Name, "error", err) continue } else if srv == nil { - // this member was a client + // this member was a client or in another region continue } @@ -210,6 +210,9 @@ func (s *Server) autopilotServer(m serf.Member) (*autopilot.Server, error) { if !ok { return nil, nil } + if srv.Region != s.Region() { + return nil, nil + } return s.autopilotServerFromMetadata(srv) } diff --git a/nomad/autopilot_test.go b/nomad/autopilot_test.go index fb2afbe08603..4b417d3db405 100644 --- a/nomad/autopilot_test.go +++ b/nomad/autopilot_test.go @@ -190,6 +190,42 @@ func TestAutopilot_RollingUpdate(t *testing.T) { waitForStableLeadership(t, servers) } +func TestAutopilot_MultiRegion(t *testing.T) { + ci.Parallel(t) + + conf := func(c *Config) { + c.NumSchedulers = 0 // reduces test log noise + c.BootstrapExpect = 3 + } + s1, cleanupS1 := TestServer(t, conf) + defer cleanupS1() + + s2, cleanupS2 := TestServer(t, conf) + defer cleanupS2() + + s3, cleanupS3 := TestServer(t, conf) + defer cleanupS3() + + // federated regions should not be considered raft peers or show up in the + // known servers list + s4, cleanupS4 := TestServer(t, func(c *Config) { + c.BootstrapExpect = 0 + c.Region = "other" + }) + defer cleanupS4() + + servers := []*Server{s1, s2, s3} + TestJoin(t, s1, s2, s3, s4) + + t.Logf("waiting for initial stable cluster") + waitForStableLeadership(t, servers) + + apDelegate := &AutopilotDelegate{s3} + known := apDelegate.KnownServers() + must.Eq(t, 3, len(known)) + +} + func TestAutopilot_CleanupStaleRaftServer(t *testing.T) { ci.Parallel(t)