From 674a1b52fa3951dcc5b4dbc7b2197cc0baa7d62b Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 4 Feb 2022 13:42:06 -0500 Subject: [PATCH] scheduler: recover from panic If processing a specific evaluation causes the scheduler (and therefore the entire server) to panic, that evaluation will never get a chance to be nack'd and cleared from the state store. It will get dequeued by another scheduler, causing that server to panic, and so forth until all servers are in a panic loop. This prevents the operator from intervening to remove the evaluation or update the state. Recover the goroutine from the top-level `Process` methods for each scheduler so that this condition can be detected without panicking the server process. This will lead to a loop of recovering the scheduler goroutine until the eval can be removed or nack'd, but that's much better than taking a downtime. --- .changelog/12009.txt | 3 +++ scheduler/generic_sched.go | 9 ++++++++- scheduler/scheduler_system.go | 8 +++++++- 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 .changelog/12009.txt diff --git a/.changelog/12009.txt b/.changelog/12009.txt new file mode 100644 index 000000000000..e57cbca6a451 --- /dev/null +++ b/.changelog/12009.txt @@ -0,0 +1,3 @@ +```release-note:improvement +scheduler: recover scheduler goroutines on panic +``` diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f263d864b246..2d83caad1954 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -125,7 +125,14 @@ func NewBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state Sta } // Process is used to handle a single evaluation -func (s *GenericScheduler) Process(eval *structs.Evaluation) error { +func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + } + }() + // Store the evaluation s.eval = eval diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index dff79aa8bc2f..fefdc0101cf7 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -72,7 +72,13 @@ func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state } // Process is used to handle a single evaluation. -func (s *SystemScheduler) Process(eval *structs.Evaluation) error { +func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + } + }() // Store the evaluation s.eval = eval