From a17c49d44ed217ac42bad1a7eaa79381472627f2 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 4 Feb 2022 13:42:06 -0500 Subject: [PATCH] scheduler: recover from panic If processing a specific evaluation causes the scheduler (and therefore the entire server) to panic, that evaluation will never get a chance to be nack'd and cleared from the state store. It will get dequeued by another scheduler, causing that server to panic, and so forth until all servers are in a panic loop. This prevents the operator from intervening to remove the evaluation or update the state. Recover the goroutine from the top-level `Process` methods for each scheduler so that this condition can be detected without panicking the server process. This will lead to a loop of recovering the scheduler goroutine until the eval can be removed or nack'd, but that's much better than taking a downtime. --- scheduler/generic_sched.go | 9 ++++++++- scheduler/scheduler_system.go | 8 +++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f263d864b246..2d83caad1954 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -125,7 +125,14 @@ func NewBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state Sta } // Process is used to handle a single evaluation -func (s *GenericScheduler) Process(eval *structs.Evaluation) error { +func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + } + }() + // Store the evaluation s.eval = eval diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index dff79aa8bc2f..fefdc0101cf7 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -72,7 +72,13 @@ func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state } // Process is used to handle a single evaluation. -func (s *SystemScheduler) Process(eval *structs.Evaluation) error { +func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler: %v", eval.ID, r) + } + }() // Store the evaluation s.eval = eval