Restructure to use channels instead of a pull model

Signed-off-by: Vince Prignano <vincepri@vmware.com>
kubernetes-sigs · Nov 3, 2021 · 423ec74 · 423ec74
1 parent fe8076b
commit 423ec74
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 147 deletions.
diff --git a/pkg/manager/internal.go b/pkg/manager/internal.go
@@ -189,7 +189,7 @@ func (cm *controllerManager) add(r Runnable) error {
 	if err := cm.SetFields(r); err != nil {
 		return err
 	}
-	return cm.runnables.Add(r, nil)
+	return cm.runnables.Add(r)
 }
 
 // Deprecated: use the equivalent Options field to set a field. This method will be removed in v0.10.
@@ -216,7 +216,7 @@ func (cm *controllerManager) AddMetricsExtraHandler(path string, handler http.Ha
 	defer cm.Unlock()
 
 	if cm.started {
-		return fmt.Errorf("uunable to add new metrics handler because metrics endpoint has already been created")
+		return fmt.Errorf("unable to add new metrics handler because metrics endpoint has already been created")
 	}
 
 	if path == defaultMetricsEndpoint {
@@ -456,21 +456,21 @@ func (cm *controllerManager) Start(ctx context.Context) (err error) {
 	// WARNING: Webhooks MUST start before any cache is populated, otherwise there is a race condition
 	// between conversion webhooks and the cache sync (usually initial list) which causes the webhooks
 	// to never start because no cache can be populated.
-	if err := cm.runnables.Webhooks.StartAndWaitReady(cm.internalCtx); err != nil {
+	if err := cm.runnables.Webhooks.Start(cm.internalCtx); err != nil {
 		if err != wait.ErrWaitTimeout {
 			return err
 		}
 	}
 
 	// Start and wait for caches.
-	if err := cm.runnables.Caches.StartAndWaitReady(cm.internalCtx); err != nil {
+	if err := cm.runnables.Caches.Start(cm.internalCtx); err != nil {
 		if err != wait.ErrWaitTimeout {
 			return err
 		}
 	}
 
 	// Start the non-leaderelection Runnables after the cache has synced.
-	if err := cm.runnables.Others.StartAndWaitReady(cm.internalCtx); err != nil {
+	if err := cm.runnables.Others.Start(cm.internalCtx); err != nil {
 		if err != wait.ErrWaitTimeout {
 			return err
 		}
@@ -601,7 +601,7 @@ func (cm *controllerManager) engageStopProcedure(stopComplete <-chan struct{}) e
 }
 
 func (cm *controllerManager) startLeaderElectionRunnables() error {
-	return cm.runnables.LeaderElection.StartAndWaitReady(cm.internalCtx)
+	return cm.runnables.LeaderElection.Start(cm.internalCtx)
 }
 
 func (cm *controllerManager) startLeaderElection(ctx context.Context) (err error) {

diff --git a/pkg/manager/manager_test.go b/pkg/manager/manager_test.go
@@ -1817,9 +1817,6 @@ func (c *startSignalingInformer) Start(ctx context.Context) error {
 
 func (c *startSignalingInformer) WaitForCacheSync(ctx context.Context) bool {
 	defer func() {
-		for !c.started() {
-			continue
-		}
 		c.mu.Lock()
 		c.wasSynced = true
 		c.mu.Unlock()

diff --git a/pkg/manager/runnable_group.go b/pkg/manager/runnable_group.go
@@ -4,9 +4,7 @@ import (
 	"context"
 	"errors"
 	"sync"
-	"time"
 
-	"k8s.io/apimachinery/pkg/util/wait"
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
 )
 
@@ -18,8 +16,8 @@ var (
 // a ready check.
 type readyRunnable struct {
 	Runnable
-	Check runnableCheck
-	Ready bool
+	Check       runnableCheck
+	signalReady bool
 }
 
 // runnableCheck can be passed to Add() to let the runnable group determine that a
@@ -46,33 +44,27 @@ func newRunnables(errChan chan error) *runnables {
 	}
 }
 
-// Add adds a runnable and its ready check to the closest
-// group of runnable that they belong to.
+// Add adds a runnable to closest group of runnable that they belong to.
 //
 // Add should be able to be called before and after Start, but not after StopAndWait.
 // Add should return an error when called during StopAndWait.
 // The runnables added before Start are started when Start is called.
 // The runnables added after Start are started directly.
-func (r *runnables) Add(fn Runnable, ready runnableCheck) error {
-	if ready == nil {
-		// If we don't have a readiness check, always return true.
-		ready = func(_ context.Context) bool { return true }
-	}
-
+func (r *runnables) Add(fn Runnable) error {
 	switch runnable := fn.(type) {
 	case hasCache:
 		return r.Caches.Add(fn, func(ctx context.Context) bool {
-			return ready(ctx) && runnable.GetCache().WaitForCacheSync(ctx)
+			return runnable.GetCache().WaitForCacheSync(ctx)
 		})
 	case *webhook.Server:
-		return r.Webhooks.Add(fn, ready)
+		return r.Webhooks.Add(fn, nil)
 	case LeaderElectionRunnable:
 		if !runnable.NeedLeaderElection() {
-			return r.Others.Add(fn, ready)
+			return r.Others.Add(fn, nil)
 		}
-		return r.LeaderElection.Add(fn, ready)
+		return r.LeaderElection.Add(fn, nil)
 	default:
-		return r.LeaderElection.Add(fn, ready)
+		return r.LeaderElection.Add(fn, nil)
 	}
 }
 
@@ -85,9 +77,11 @@ type runnableGroup struct {
 	ctx    context.Context
 	cancel context.CancelFunc
 
-	start     sync.Mutex
-	startOnce sync.Once
-	started   bool
+	start        sync.Mutex
+	startOnce    sync.Once
+	started      bool
+	startQueue   []*readyRunnable
+	startReadyCh chan *readyRunnable
 
 	stop     sync.RWMutex
 	stopOnce sync.Once
@@ -104,23 +98,14 @@ type runnableGroup struct {
 	// wg is an internal sync.WaitGroup that allows us to properly stop
 	// and wait for all the runnables to finish before returning.
 	wg *sync.WaitGroup
-
-	// group is a sync.Map that contains every runnable ever.
-	// The key of the map is the runnable itself (key'd by pointer),
-	// while the value is its ready state.
-	//
-	// The group of runnable is append-only, runnables scheduled
-	// through this group are going to be stored in this internal map
-	// until the application exits. The limit is the available memory.
-	group *sync.Map
 }
 
 func newRunnableGroup(errChan chan error) *runnableGroup {
 	r := &runnableGroup{
-		errChan: errChan,
-		ch:      make(chan *readyRunnable),
-		wg:      new(sync.WaitGroup),
-		group:   new(sync.Map),
+		startReadyCh: make(chan *readyRunnable),
+		errChan:      errChan,
+		ch:           make(chan *readyRunnable),
+		wg:           new(sync.WaitGroup),
 	}
 	r.ctx, r.cancel = context.WithCancel(context.Background())
 	return r
@@ -133,25 +118,57 @@ func (r *runnableGroup) Started() bool {
 	return r.started
 }
 
-// StartAndWaitReady starts all the runnables previously
-// added to the group and waits for all to report ready.
-func (r *runnableGroup) StartAndWaitReady(ctx context.Context) error {
-	r.Start()
-	return r.WaitReady(ctx)
-}
+// Start starts the group and waits for all
+// initially registered runnables to start.
+// It can only be called once, subsequent calls have no effect.
+func (r *runnableGroup) Start(ctx context.Context) error {
+	var retErr error
 
-// Start starts the group, it can only be called once.
-func (r *runnableGroup) Start() {
 	r.startOnce.Do(func() {
+		defer close(r.startReadyCh)
+
+		// Start the internal reconciler.
 		go r.reconcile()
+
+		// Start the group and queue up all
+		// the runnables that were added prior.
 		r.start.Lock()
 		r.started = true
-		r.group.Range(func(key, _ interface{}) bool {
-			r.ch <- key.(*readyRunnable)
-			return true
-		})
+		for _, rn := range r.startQueue {
+			rn.signalReady = true
+			r.ch <- rn
+		}
 		r.start.Unlock()
+
+		// If we don't have any queue, return.
+		if len(r.startQueue) == 0 {
+			return
+		}
+
+		// Wait for all runnables to signal.
+		for {
+			select {
+			case <-ctx.Done():
+				if err := ctx.Err(); !errors.Is(err, context.Canceled) {
+					retErr = err
+				}
+			case rn := <-r.startReadyCh:
+				for i, existing := range r.startQueue {
+					if existing == rn {
+						// Remove the item from the start queue.
+						r.startQueue = append(r.startQueue[:i], r.startQueue[i+1:]...)
+						break
+					}
+				}
+				// We're done waiting if the queue is empty, return.
+				if len(r.startQueue) == 0 {
+					return
+				}
+			}
+		}
 	})
+
+	return retErr
 }
 
 // reconcile is our main entrypoint for every runnable added
@@ -185,26 +202,17 @@ func (r *runnableGroup) reconcile() {
 		go func(rn *readyRunnable) {
 			go func() {
 				if rn.Check(r.ctx) {
-					r.group.Store(rn, true)
+					if rn.signalReady {
+						r.startReadyCh <- rn
+					}
 				}
 			}()
 
 			// If we return, the runnable ended cleanly
 			// or returned an error to the channel.
 			//
-			// We should always decrement the WaitGroup and
-			// mark the runnable as ready.
-			//
-			// Think about the group as an append-only system.
-			//
-			// A runnable is marked as ready if:
-			// - The health check return true.
-			// - The runnable Start() method returned and
-			//	 it either finished cleanly (e.g. one shot operations)
-			//   or it failed to run and it returned an error which
-			//   gets propagated to the manager.
+			// We should always decrement the WaitGroup here.
 			defer r.wg.Done()
-			defer r.group.Store(rn, true)
 
 			// Start the runnable.
 			if err := rn.Start(r.ctx); err != nil {
@@ -214,27 +222,6 @@ func (r *runnableGroup) reconcile() {
 	}
 }
 
-// WaitReady polls until the group is ready or until the context is cancelled.
-func (r *runnableGroup) WaitReady(ctx context.Context) error {
-	return wait.PollImmediateInfiniteWithContext(ctx,
-		100*time.Millisecond,
-		func(_ context.Context) (bool, error) {
-			if !r.Started() {
-				return false, nil
-			}
-			ready, total := 0, 0
-			r.group.Range(func(_, value interface{}) bool {
-				total++
-				if rd, ok := value.(bool); ok && rd {
-					ready++
-				}
-				return true
-			})
-			return ready == total, nil
-		},
-	)
-}
-
 // Add should be able to be called before and after Start, but not after StopAndWait.
 // Add should return an error when called during StopAndWait.
 func (r *runnableGroup) Add(rn Runnable, ready runnableCheck) error {
@@ -261,11 +248,10 @@ func (r *runnableGroup) Add(rn Runnable, ready runnableCheck) error {
 	{
 		r.start.Lock()
 
-		// Store the runnable in the internal buffer.
-		r.group.Store(readyRunnable, false)
-
 		// Check if we're already started.
 		if !r.started {
+			// Store the runnable in the internal if not.
+			r.startQueue = append(r.startQueue, readyRunnable)
 			r.start.Unlock()
 			return nil
 		}
@@ -283,7 +269,7 @@ func (r *runnableGroup) StopAndWait(ctx context.Context) {
 		// Close the reconciler channel once we're done.
 		defer close(r.ch)
 
-		r.Start()
+		_ = r.Start(ctx)
 		r.stop.Lock()
 		// Store the stopped variable so we don't accept any new
 		// runnables for the time being.