mcs: reorganize cluster start and stop process #7155

Merged · 5 commits · Oct 9, 2023
32 changes: 31 additions & 1 deletion pkg/mcs/scheduling/server/cluster.go
@@ -47,6 +47,7 @@ type Cluster struct {
checkMembershipCh chan struct{}
apiServerLeader atomic.Value
clusterID uint64
running atomic.Bool
}

const regionLabelGCInterval = time.Hour
@@ -215,6 +216,9 @@ func (c *Cluster) updateScheduler() {
// Make sure the check will be triggered once later.
notifier <- struct{}{}
c.persistConfig.SetSchedulersUpdatingNotifier(notifier)
ticker := time.NewTicker(time.Second)
defer ticker.Stop()

for {
select {
case <-c.ctx.Done():
@@ -224,6 +228,18 @@
// This is triggered by the watcher when the schedulers are updated.
}

if !c.running.Load() {
select {
case <-c.ctx.Done():
log.Info("cluster is closing, stop listening the schedulers updating notifier")
return
case <-ticker.C:
// retry
notifier <- struct{}{}
Member:

Is it possible that we have a deadlock here? The channel has a capacity of only 1, so if the scheduler config watcher has just sent on it, this send could block.
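For illustration only, one common way to sidestep a blocked send on a capacity-1 notifier is a non-blocking send; this is a minimal sketch of that pattern, not the code adopted in this PR:

    // Sketch: retry without risking a blocked send. If the buffered notifier
    // (capacity 1) already holds a pending notification, the default branch
    // skips the send, since the consumer will be woken up anyway.
    func notifyNonBlocking(notifier chan<- struct{}) {
    	select {
    	case notifier <- struct{}{}:
    		// queued a new notification
    	default:
    		// a notification is already pending; nothing to do
    	}
    }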

continue
}
}

log.Info("schedulers updating notifier is triggered, try to update the scheduler")
Contributor:

If the server is stopped here, is there a data race?

Member Author:

I think it is the same as the current PD.

Contributor:

In other words, is it possible to hit a data race when a scheduler is added and the coordinator waits at the same time?

Member Author:

I think so, but the possibility is much smaller than before.

Member Author (@rleungx, Sep 26, 2023):

Another way: we can check the cluster status before adding a scheduler every time.

Contributor:

But there is still a gap between checking the status and adding the scheduler: if the server is stopped after the cluster status is checked and before the scheduler is added, a data race is still possible.

Member Author:

The problem is the way we use the wait group for the scheduler controller, not the wait group itself.
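As a general illustration of the race discussed here (a sketch with assumed names, not the PD implementation): sync.WaitGroup forbids calling Add concurrently with Wait once the counter may have reached zero, so serializing Add and Wait through the controller's mutex, as the scheduler_controller.go change below does for Wait, closes the gap between the status check and the Add.

    package sketch

    import (
    	"errors"
    	"sync"
    )

    // Hypothetical controller owning both the mutex and the wait group.
    // Add and Wait are serialized by the same lock, so a scheduler cannot
    // be registered while the coordinator is draining the group.
    type controller struct {
    	mu      sync.Mutex
    	wg      sync.WaitGroup
    	stopped bool
    }

    func (c *controller) addScheduler(run func()) error {
    	c.mu.Lock()
    	defer c.mu.Unlock()
    	if c.stopped {
    		return errors.New("controller is stopped") // hypothetical error
    	}
    	c.wg.Add(1)
    	go func() {
    		defer c.wg.Done()
    		run()
    	}()
    	return nil
    }

    func (c *controller) wait() {
    	// Holding the lock here means no addScheduler call can slip a
    	// wg.Add in between the stop decision and the wg.Wait below.
    	c.mu.Lock()
    	defer c.mu.Unlock()
    	c.stopped = true
    	c.wg.Wait()
    }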

var (
schedulersController = c.coordinator.GetSchedulersController()
@@ -394,15 +410,29 @@ func (c *Cluster) runUpdateStoreStats() {
}
}

// runCoordinator runs the main scheduling loop.
func (c *Cluster) runCoordinator() {
defer logutil.LogPanic()
defer c.wg.Done()
c.coordinator.RunUntilStop()
}

// StartBackgroundJobs starts background jobs.
func (c *Cluster) StartBackgroundJobs() {
c.wg.Add(2)
c.wg.Add(3)
go c.updateScheduler()
go c.runUpdateStoreStats()
go c.runCoordinator()
c.running.Store(true)
}

// StopBackgroundJobs stops background jobs.
func (c *Cluster) StopBackgroundJobs() {
if !c.running.Load() {
return
}
c.running.Store(false)
c.coordinator.Stop()
c.cancel()
c.wg.Wait()
}
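For context, the shape of the start/stop pair introduced above can be sketched roughly as follows (illustrative names such as backgroundJobs, not the actual Cluster type): an atomic flag makes the stop path a no-op when the jobs were never started and guards against a double stop.

    package sketch

    import (
    	"context"
    	"sync"
    	"sync/atomic"
    )

    // backgroundJobs bundles the cancel function, wait group, and running flag
    // used to start and stop a set of background goroutines exactly once.
    type backgroundJobs struct {
    	cancel  context.CancelFunc
    	wg      sync.WaitGroup
    	running atomic.Bool
    }

    func (b *backgroundJobs) start(workers ...func(context.Context)) {
    	ctx, cancel := context.WithCancel(context.Background())
    	b.cancel = cancel
    	for _, w := range workers {
    		b.wg.Add(1)
    		go func(w func(context.Context)) {
    			defer b.wg.Done()
    			w(ctx)
    		}(w)
    	}
    	b.running.Store(true)
    }

    func (b *backgroundJobs) stop() {
    	// CompareAndSwap makes stop idempotent: only the first call cancels
    	// the context and waits for the workers to exit.
    	if !b.running.CompareAndSwap(true, false) {
    		return
    	}
    	b.cancel()
    	b.wg.Wait()
    }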
12 changes: 7 additions & 5 deletions pkg/mcs/scheduling/server/server.go
@@ -444,16 +444,12 @@ func (s *Server) startCluster(context.Context) error {
}
s.configWatcher.SetSchedulersController(s.cluster.GetCoordinator().GetSchedulersController())
s.cluster.StartBackgroundJobs()
go s.GetCoordinator().RunUntilStop()
return nil
}

func (s *Server) stopCluster() {
s.GetCoordinator().Stop()
s.cluster.StopBackgroundJobs()
s.ruleWatcher.Close()
s.configWatcher.Close()
s.metaWatcher.Close()
s.stopWatcher()
}

func (s *Server) startWatcher() (err error) {
@@ -469,6 +465,12 @@ func (s *Server) startWatcher() (err error) {
return err
}

func (s *Server) stopWatcher() {
s.ruleWatcher.Close()
s.configWatcher.Close()
s.metaWatcher.Close()
}

// GetPersistConfig returns the persist config.
// It's used to test.
func (s *Server) GetPersistConfig() *config.PersistConfig {
2 changes: 2 additions & 0 deletions pkg/schedule/schedulers/scheduler_controller.go
@@ -68,6 +68,8 @@ func NewController(ctx context.Context, cluster sche.SchedulerCluster, storage e

// Wait waits on all schedulers to exit.
func (c *Controller) Wait() {
c.Lock()
defer c.Unlock()
c.wg.Wait()
}
