From beadd2ae5fcc9fe3789930103c2a8000aed0c669 Mon Sep 17 00:00:00 2001 From: Lawrence Jones Date: Fri, 17 May 2019 16:18:33 +0100 Subject: [PATCH] Fix for hanging sentinels If etcd returned an error, our previous behaviour would be to report the error and restart an election. The outer loop never responded to the error by stopping the election, which would cancel the context, and instead would restart a new campaign. The first campaigns session will continue to exist on the context the election loop had originally provided it, which is context.Background. We'll come round to perform a new leadership election and the etcdElection.Campaign() method will block waiting for all existing leased keys to be removed from the prefix. Because the old campaign sessions are still alive, they'll continue to renew their leases and ensure those revisions never get deleted, thus blocking our sentinel campaign loop indefinitely. After discussion in #659, it became clear that most of the store implementations are likely to have similar issues, and instead of fixing the underlying implementations we decided to amend the interface to warn consumers that they need to terminate existing elections before starting new ones. This is a sharp edge of the election API and one that might catch us in a future refactor. This is balanced against a hope that we'll refactor the election interface before a second use case is likely to arise. --- cmd/sentinel/cmd/sentinel.go | 6 ++++++ internal/store/etcdv3.go | 5 ----- internal/store/store.go | 5 +++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cmd/sentinel/cmd/sentinel.go b/cmd/sentinel/cmd/sentinel.go index 100e381bb..541f6f09f 100644 --- a/cmd/sentinel/cmd/sentinel.go +++ b/cmd/sentinel/cmd/sentinel.go @@ -98,6 +98,12 @@ func (s *Sentinel) electionLoop(ctx context.Context) { case err := <-errCh: if err != nil { log.Errorw("election loop error", zap.Error(err)) + + // It's important to Stop() any on-going elections, as most stores will block + // until all previous elections have completed. If we continue without stopping, + // we run the risk of preventing any subsequent elections from successfully + // electing a leader. + s.election.Stop() } goto end case <-ctx.Done(): diff --git a/internal/store/etcdv3.go b/internal/store/etcdv3.go index 135782f2c..895e722e0 100644 --- a/internal/store/etcdv3.go +++ b/internal/store/etcdv3.go @@ -199,14 +199,12 @@ func (e *etcdv3Election) campaign() { e.electedCh <- false s, err := concurrency.NewSession(e.c, concurrency.WithTTL(int(e.ttl.Seconds())), concurrency.WithContext(e.ctx)) if err != nil { - e.running = false e.errCh <- err return } etcdElection := concurrency.NewElection(s, e.path) if err = etcdElection.Campaign(e.ctx, e.candidateUID); err != nil { - e.running = false e.errCh <- err return } @@ -215,11 +213,8 @@ func (e *etcdv3Election) campaign() { select { case <-e.ctx.Done(): - e.running = false - etcdElection.Resign(context.TODO()) return case <-s.Done(): - etcdElection.Resign(context.TODO()) e.electedCh <- false } } diff --git a/internal/store/store.go b/internal/store/store.go index 6becbb4e7..5776fa56e 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -47,6 +47,11 @@ type Election interface { // TODO(sgotti) this mimics the current docker/leadership API and the etcdv3 // implementations adapt to it. In future it could be replaced with a better // api like the current one implemented by etcdclientv3/concurrency. + // + // WARNING: If the election error channel receives any error, it is vital that + // the consuming code calls election.Stop(). Failure to do so can cause + // subsequent elections to hang indefinitely across all participants of an + // election. RunForElection() (<-chan bool, <-chan error) Leader() (string, error) Stop()