From cb4ae1f7f4542d0108e315a13a8dff7e9f169594 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Mon, 14 Mar 2022 16:16:25 -0400 Subject: [PATCH 1/5] server: transfer leadership in case of error When a Nomad server becomes the Raft leader, it must perform several actions defined in the establishLeadership function. If any of these actions fail, Raft will think the node is the leader, but it will not actually be able to act as a Nomad leader. In this scenario, leadership must be revoked and transferred to another server if possible, or the node should retry the establishLeadership steps. --- nomad/leader.go | 56 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/nomad/leader.go b/nomad/leader.go index 5a50995a41f7..c2ee2f3e9a45 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -120,6 +120,27 @@ func (s *Server) monitorLeadership() { } } +func (s *Server) leadershipTransfer() error { + retryCount := 3 + for i := 0; i < retryCount; i++ { + future := s.raft.LeadershipTransfer() + if err := future.Error(); err != nil { + s.logger.Error("failed to transfer leadership attempt, will retry", + "attempt", i, + "retry_limit", retryCount, + "error", err, + ) + } else { + s.logger.Info("successfully transferred leadership", + "attempt", i, + "retry_limit", retryCount, + ) + return nil + } + } + return fmt.Errorf("failed to transfer leadership in %d attempts", retryCount) +} + // leaderLoop runs as long as we are the leader to run various // maintenance activities func (s *Server) leaderLoop(stopCh chan struct{}) { @@ -151,7 +172,15 @@ RECONCILE: s.logger.Error("failed to revoke leadership", "error", err) } - goto WAIT + // Attempt to transfer leadership. If successful, leave the + // leaderLoop since this node is no longer the leader. Otherwise + // try to establish leadership again after 5 seconds. + if err := s.leadershipTransfer(); err != nil { + s.logger.Error("failed to transfer leadership", "error", err) + interval = time.After(5 * time.Second) + goto WAIT + } + return } establishedLeader = true @@ -182,10 +211,12 @@ RECONCILE: } WAIT: - // Wait until leadership is lost + // Wait until leadership is lost or periodically reconcile as long as we + // are the leader, or when Serf events arrive. for { select { case <-stopCh: + // Lost leadership. return case <-s.shutdownCh: return @@ -213,6 +244,27 @@ WAIT: s.revokeLeadership() err := s.establishLeadership(stopCh) errCh <- err + + // In case establishLeadership fails, try to transfer leadership. + // At this point Raft thinks we are the leader, but Nomad did not + // complete the required steps to act as the leader. + if err != nil { + if err := s.leadershipTransfer(); err != nil { + // establishedLeader was true before, but it no longer is + // since we revoked leadership and leadershipTransfer also + // failed. + // Stay in the leaderLoop with establishedLeader set to + // false so we try to establish leadership again in the + // next loop. + establishedLeader = false + interval = time.After(5 * time.Second) + goto WAIT + } + + // leadershipTransfer was successful and it is + // time to leave the leaderLoop. + return + } } } } From b15b7bec32b5e52fcc10031a341f9a6f215098ea Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Mon, 14 Mar 2022 17:02:07 -0400 Subject: [PATCH 2/5] changelog: add entry for #12293 --- .changelog/12293.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/12293.txt diff --git a/.changelog/12293.txt b/.changelog/12293.txt new file mode 100644 index 000000000000..6248b9e53cb0 --- /dev/null +++ b/.changelog/12293.txt @@ -0,0 +1,3 @@ +```release-note:improvement +server: Transfer Raft leadership in case the Nomad server fails to establish leadership +``` From 2dc1ff1f7db84f654e7f4fac059cf3470030719b Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Wed, 16 Mar 2022 18:20:45 -0400 Subject: [PATCH 3/5] return early when leadership transfer fail due to protocol version --- nomad/leader.go | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/nomad/leader.go b/nomad/leader.go index c2ee2f3e9a45..9dd3792e3ec1 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -123,20 +123,23 @@ func (s *Server) monitorLeadership() { func (s *Server) leadershipTransfer() error { retryCount := 3 for i := 0; i < retryCount; i++ { - future := s.raft.LeadershipTransfer() - if err := future.Error(); err != nil { - s.logger.Error("failed to transfer leadership attempt, will retry", - "attempt", i, - "retry_limit", retryCount, - "error", err, - ) - } else { - s.logger.Info("successfully transferred leadership", - "attempt", i, - "retry_limit", retryCount, - ) + err := s.raft.LeadershipTransfer().Error() + if err == nil { + s.logger.Info("successfully transferred leadership") return nil } + + // Don't retry if the Raft version doesn't support leadership transfer + // since this will never succeed. + if err == raft.ErrUnsupportedProtocol { + return fmt.Errorf("leadership transfer not supported with Raft version lower than 3") + } + + s.logger.Error("failed to transfer leadership attempt, will retry", + "attempt", i, + "retry_limit", retryCount, + "error", err, + ) } return fmt.Errorf("failed to transfer leadership in %d attempts", retryCount) } From 4ffa808d68819ae16fda801566e74e442f0c6144 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Wed, 16 Mar 2022 18:21:20 -0400 Subject: [PATCH 4/5] docs: add upgrade note about raft leadership transfer --- website/content/docs/upgrade/upgrade-specific.mdx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index baa5703bdc17..b7d21daeaf33 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -51,6 +51,13 @@ The volume staging directory for new CSI plugin tasks will now be mounted to the task's `NOMAD_TASK_DIR` instead of the `csi_plugin.mount_config`. +#### Raft leadership transfer on error + +Starting with Nomad 1.3.0, when a Nomad leader fails to establish leadership it +will attempted to gracefully transfer leadership to another eligible server in +the cluster. This operation is only supported when using Raft Protocol Version +3. + #### Server Raft Database The server raft database in `raft.db` will be automatically migrated to a new From 772abb626fa77e1562e7ba789df0cdab360cfca5 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Wed, 16 Mar 2022 18:47:42 -0400 Subject: [PATCH 5/5] docs: rewrite raft leadership transfer upgrade note --- website/content/docs/upgrade/upgrade-specific.mdx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index b7d21daeaf33..c60a617776f5 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -53,10 +53,11 @@ mounted to the task's `NOMAD_TASK_DIR` instead of the #### Raft leadership transfer on error -Starting with Nomad 1.3.0, when a Nomad leader fails to establish leadership it -will attempted to gracefully transfer leadership to another eligible server in -the cluster. This operation is only supported when using Raft Protocol Version -3. +Starting with Nomad 1.3.0, when a Nomad server is elected the Raft leader but +fails to complete the process to start acting as the Nomad leader it will +attempt to gracefully transfer its Raft leadership status to another eligible +server in the cluster. This operation is only supported when using Raft +Protocol Version 3. #### Server Raft Database