From cff3c9b874bfdae38e6dc53de3a2fa1d2b8065af Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 12 Jun 2023 08:53:56 -0400 Subject: [PATCH] replication: fix potential panic during upgrades (#17476) If the authoritative region has been upgraded to a version of Nomad that has new replicated objects (such as ACL Auth Methods, ACL Binding Rules, etc.), the non-authoritative regions will start replicating those objects as soon as their leader is upgraded. If a server in the non-authoritative region is upgraded and then becomes the leader before all the other servers in the region have been upgraded, then it will attempt to write a Raft log entry that the followers don't understand. The followers will then panic. Add the same minimum version checks that we do for RPC writes to the leader's replication loop. --- .changelog/17476.txt | 3 +++ nomad/leader.go | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 .changelog/17476.txt diff --git a/.changelog/17476.txt b/.changelog/17476.txt new file mode 100644 index 000000000000..f0de4c832ce5 --- /dev/null +++ b/.changelog/17476.txt @@ -0,0 +1,3 @@ +```release-note:bug +replication: Fix a potential panic when a non-authoritative region is upgraded and a server with the new version becomes the leader. +``` diff --git a/nomad/leader.go b/nomad/leader.go index e6c7b760201a..ff07d5c77b66 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -58,7 +58,7 @@ var minACLRoleVersion = version.Must(version.NewVersion("1.4.0")) // minACLAuthMethodVersion is the Nomad version at which the ACL auth methods // table was introduced. It forms the minimum version all federated servers must // meet before the feature can be used. -var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0-beta.1")) +var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0")) // minACLJWTAuthMethodVersion is the Nomad version at which the ACL JWT auth method type // was introduced.
It forms the minimum version all federated servers must @@ -68,7 +68,7 @@ var minACLJWTAuthMethodVersion = version.Must(version.NewVersion("1.5.4")) // minACLBindingRuleVersion is the Nomad version at which the ACL binding rules // table was introduced. It forms the minimum version all federated servers // must meet before the feature can be used. -var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0-beta.1")) +var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0")) // minNomadServiceRegistrationVersion is the Nomad version at which the service // registrations table was introduced. It forms the minimum version all local @@ -1848,6 +1848,17 @@ func (s *Server) replicateACLRoles(stopCh chan struct{}) { // parameters are controlled internally. _ = limiter.Wait(context.Background()) + if !ServersMeetMinimumVersion( + s.serf.Members(), s.Region(), minACLRoleVersion, true) { + s.logger.Trace( + "all servers must be upgraded to 1.4.0 or later before ACL Roles can be replicated") + if s.replicationBackoffContinue(stopCh) { + continue + } else { + return + } + } + // Set the replication token on each replication iteration so that // it is always current and can handle agent SIGHUP reloads. req.AuthToken = s.ReplicationToken() @@ -2046,6 +2057,17 @@ func (s *Server) replicateACLAuthMethods(stopCh chan struct{}) { // parameters are controlled internally. _ = limiter.Wait(context.Background()) + if !ServersMeetMinimumVersion( + s.serf.Members(), s.Region(), minACLAuthMethodVersion, true) { + s.logger.Trace( + "all servers must be upgraded to 1.5.0 or later before ACL Auth Methods can be replicated") + if s.replicationBackoffContinue(stopCh) { + continue + } else { + return + } + } + // Set the replication token on each replication iteration so that // it is always current and can handle agent SIGHUP reloads. 
req.AuthToken = s.ReplicationToken() @@ -2241,6 +2263,17 @@ func (s *Server) replicateACLBindingRules(stopCh chan struct{}) { // parameters are controlled internally. _ = limiter.Wait(context.Background()) + if !ServersMeetMinimumVersion( + s.serf.Members(), s.Region(), minACLBindingRuleVersion, true) { + s.logger.Trace( + "all servers must be upgraded to 1.5.0 or later before ACL Binding Rules can be replicated") + if s.replicationBackoffContinue(stopCh) { + continue + } else { + return + } + } + // Set the replication token on each replication iteration so that // it is always current and can handle agent SIGHUP reloads. req.AuthToken = s.ReplicationToken()