From 6465f94a2d26cdacc232fddc20f98d98df61ddac Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Sun, 3 Mar 2024 00:07:54 +0800 Subject: [PATCH] Fix queue worker incorrectly stopped when there are still more items in the queue (#29532) Without `case <-t.C`, the workers would stop incorrectly and the test wouldn't pass. In the worst case, there might be only one running worker processing the queue items for a long time because the other workers have stopped. The root cause is related to the logic of doDispatchBatchToWorker. It isn't a serious problem at the moment, so keep it as-is. --- modules/queue/workergroup.go | 20 ++++++++++++++++---- modules/queue/workerqueue.go | 2 ++ modules/queue/workerqueue_test.go | 29 ++++++++++++++++++++++++----- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/modules/queue/workergroup.go b/modules/queue/workergroup.go index 147a4f335e10..e3801ef2b2dd 100644 --- a/modules/queue/workergroup.go +++ b/modules/queue/workergroup.go @@ -60,6 +60,9 @@ func (q *WorkerPoolQueue[T]) doDispatchBatchToWorker(wg *workerGroup[T], flushCh full = true } + // TODO: the logic could be improved in the future, to avoid a data-race between "doStartNewWorker" and "workerNum" + // The root problem is that if we skip "doStartNewWorker" here, the "workerNum" might be decreased by other workers later + // So ideally, it should check whether there are enough workers by some approaches, and start new workers if necessary. 
q.workerNumMu.Lock() noWorker := q.workerNum == 0 if full || noWorker { @@ -143,7 +146,11 @@ func (q *WorkerPoolQueue[T]) doStartNewWorker(wp *workerGroup[T]) { log.Debug("Queue %q starts new worker", q.GetName()) defer log.Debug("Queue %q stops idle worker", q.GetName()) + atomic.AddInt32(&q.workerStartedCounter, 1) // Only increase counter, used for debugging + t := time.NewTicker(workerIdleDuration) + defer t.Stop() + keepWorking := true stopWorking := func() { q.workerNumMu.Lock() @@ -158,13 +165,18 @@ func (q *WorkerPoolQueue[T]) doStartNewWorker(wp *workerGroup[T]) { case batch, ok := <-q.batchChan: if !ok { stopWorking() - } else { - q.doWorkerHandle(batch) - t.Reset(workerIdleDuration) + continue + } + q.doWorkerHandle(batch) + // reset the idle ticker, and drain the tick after reset in case a tick is already triggered + t.Reset(workerIdleDuration) + select { + case <-t.C: + default: } case <-t.C: q.workerNumMu.Lock() - keepWorking = q.workerNum <= 1 + keepWorking = q.workerNum <= 1 // keep the last worker running if !keepWorking { q.workerNum-- } diff --git a/modules/queue/workerqueue.go b/modules/queue/workerqueue.go index b28fd880270a..4160622d8138 100644 --- a/modules/queue/workerqueue.go +++ b/modules/queue/workerqueue.go @@ -40,6 +40,8 @@ type WorkerPoolQueue[T any] struct { workerMaxNum int workerActiveNum int workerNumMu sync.Mutex + + workerStartedCounter int32 } type flushType chan struct{} diff --git a/modules/queue/workerqueue_test.go b/modules/queue/workerqueue_test.go index e60120162a70..e09669c54255 100644 --- a/modules/queue/workerqueue_test.go +++ b/modules/queue/workerqueue_test.go @@ -11,6 +11,7 @@ import ( "time" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" "github.com/stretchr/testify/assert" ) @@ -175,11 +176,7 @@ func testWorkerPoolQueuePersistence(t *testing.T, queueSetting setting.QueueSett } func TestWorkerPoolQueueActiveWorkers(t *testing.T) { - oldWorkerIdleDuration := workerIdleDuration - 
workerIdleDuration = 300 * time.Millisecond - defer func() { - workerIdleDuration = oldWorkerIdleDuration - }() + defer test.MockVariableValue(&workerIdleDuration, 300*time.Millisecond)() handler := func(items ...int) (unhandled []int) { time.Sleep(100 * time.Millisecond) @@ -250,3 +247,25 @@ func TestWorkerPoolQueueShutdown(t *testing.T) { q, _ = newWorkerPoolQueueForTest("test-workpoolqueue", qs, handler, false) assert.EqualValues(t, 20, q.GetQueueItemNumber()) } + +func TestWorkerPoolQueueWorkerIdleReset(t *testing.T) { + defer test.MockVariableValue(&workerIdleDuration, 10*time.Millisecond)() + + handler := func(items ...int) (unhandled []int) { + time.Sleep(50 * time.Millisecond) + return nil + } + + q, _ := newWorkerPoolQueueForTest("test-workpoolqueue", setting.QueueSettings{Type: "channel", BatchLength: 1, MaxWorkers: 2, Length: 100}, handler, false) + stop := runWorkerPoolQueue(q) + for i := 0; i < 20; i++ { + assert.NoError(t, q.Push(i)) + } + + time.Sleep(500 * time.Millisecond) + assert.EqualValues(t, 2, q.GetWorkerNumber()) + assert.EqualValues(t, 2, q.GetWorkerActiveNumber()) + // when the queue never becomes empty, the existing workers should keep working + assert.EqualValues(t, 2, q.workerStartedCounter) + stop() +}