From f1e46e31de3b10d84ee250291ac1e05c6c0433b4 Mon Sep 17 00:00:00 2001 From: Guilhem Fanton <8671905+gfanton@users.noreply.github.com> Date: Mon, 3 Jun 2024 19:40:40 +0200 Subject: [PATCH] fix(bft): potential deadlock in the consensus ticker on close (#2264) This PR addresses a potential deadlock issue that can occur if `ScheduleTimeout` is used after the `timeoutTicker` service has been stopped. This problem has been observed in some CI jobs when stopping the node: https://github.com/gnolang/gno/actions/runs/9301771431/attempts/1 (thanks @thehowl). The stacktrace indicates that `timeoutTicker` has likely been closed before `ConsensusState`, and `ConsensusState` is getting stuck while calling `ScheduleTimeout` because the `tickChan` will never be read. The proposed solution is to simply ignore any scheduled timeouts if the `timeoutTicker` service has been closed. I've extracted the relevant part of the stacktrace: - The first stack shows the `ConsensusReactor` waiting for `ConsensusState` on stop. - The second stack shows `ConsensusState` waiting to `ScheduleTimeout` on the `timeoutTicker`. ``` coverage: 53.8% of statements panic: test timed out after 30m0s running tests: TestTestdata (30m0s) TestTestdata/gnoland (29m54s) --- goroutine 11039 [chan receive, 29 minutes]: github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusState).Wait(...) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/state.go:376 github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusReactor).OnStop(0xc000176a00) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/reactor.go:92 +0x75 github.com/gnolang/gno/tm2/pkg/service.(*BaseService).Stop(0xc000176a00) /home/runner/work/gno/gno/tm2/pkg/service/service.go:168 +0x22e github.com/gnolang/gno/tm2/pkg/p2p.(*Switch).OnStop(0xc000396600) --- goroutine 15586 [chan send, 29 minutes]: github.com/gnolang/gno/tm2/pkg/bft/consensus.(*timeoutTicker).ScheduleTimeout(0x1d4e957d?, {0x5ed0bdb, 0x7, 0x0, 0x1}) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/ticker.go:71 +0x30 github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusState).scheduleTimeout(...) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/state.go:472 github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusState).scheduleRound0(0xc000ff7508, 0xc000ff7598) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/state.go:467 +0x8b ... github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusState).tryAddVote(0xc000ff7508, 0x41bdda?, {0x0?, 0x7f11457181a8?}) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/state.go:1518 +0x2f github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusState).handleMsg(0xc000ff7508, {{0x12fc4c0, 0xc000fe59f0}, {0x0, 0x0}}) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/state.go:696 +0xf8 github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusState).receiveRoutine(0xc000ff7508, 0x0) /home/runner/work/gno/gno/tm2/pkg/bft/consensus/state.go:655 +0x452 created by github.com/gnolang/gno/tm2/pkg/bft/consensus.(*ConsensusState).OnStart in goroutine 11039 /home/runner/work/gno/gno/tm2/pkg/bft/consensus/state.go:349 +0x4ef --- ```
Contributors' checklist... - [ ] Added new tests, or not needed, or not feasible - [ ] Provided an example (e.g. screenshot) to aid review or the PR is self-explanatory - [ ] Updated the official documentation or not needed - [ ] No breaking changes were made, or a `BREAKING CHANGE: xxx` message was included in the description - [ ] Added references to related issues and PRs - [ ] Provided any useful hints for running manual tests - [ ] Added new benchmarks to [generated graphs](https://gnoland.github.io/benchmarks), if any. More info [here](https://github.com/gnolang/gno/blob/master/.benchmarks/README.md).
Signed-off-by: gfanton <8671905+gfanton@users.noreply.github.com> --- tm2/pkg/bft/consensus/ticker.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tm2/pkg/bft/consensus/ticker.go b/tm2/pkg/bft/consensus/ticker.go index 8448e014260..461a93e3e6e 100644 --- a/tm2/pkg/bft/consensus/ticker.go +++ b/tm2/pkg/bft/consensus/ticker.go @@ -67,8 +67,13 @@ func (t *timeoutTicker) Chan() <-chan timeoutInfo { // ScheduleTimeout schedules a new timeout by sending on the internal tickChan. // The timeoutRoutine is always available to read from tickChan, so this won't block. // The scheduling may fail if the timeoutRoutine has already scheduled a timeout for a later height/round/step. +// If the service has been closed, the timeout will be ignored. func (t *timeoutTicker) ScheduleTimeout(ti timeoutInfo) { - t.tickChan <- ti + select { + case t.tickChan <- ti: + case <-t.Quit(): + t.Logger.Warn("Unable to schedule timeout as service has been closed") + } } // -------------------------------------------------------------