fix multiple overflow errors in exponential backoff
We use capped exponential backoff in several places in the code when handling
failures. The code we've copy-and-pasted all over has a check to see if the
backoff is greater than the limit, but this check happens after the bitshift and
we always increment the number of attempts. This causes an overflow after a
fairly small number of failures (e.g. at one place I tested, it occurs after
only 24 iterations), resulting in a negative backoff which then never recovers.
The retry then becomes a tight loop, consuming resources and/or DoS'ing a Nomad
RPC handler or an external API such as Vault. Note that this doesn't occur in
places where we cap the number of iterations so that the loop breaks (usually to
return an error), so long as the number of iterations is reasonable.

Introduce a check on the cap before the bitshift to avoid overflow in all places
this can occur.
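
For illustration, here is a minimal, runnable sketch of the difference (the baseline, limit, and iteration count below are invented for the example and are not Nomad's real constants). The old pattern recomputes the shift on every failure, so the product eventually wraps into a negative time.Duration; the fixed pattern only recomputes while the backoff is still below the cap, so neither the shift nor the computed value keeps growing.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Illustrative values only, not Nomad's actual constants.
	const (
		baseline = 250 * time.Millisecond
		limit    = 5 * time.Second
	)

	var attempts uint64
	fixed := time.Duration(0)

	for i := 0; i < 25; i++ {
		// Old pattern: the shift is recomputed on every failure, so once
		// 2*i is large enough the product wraps into a negative Duration.
		// A timer armed with a negative duration fires immediately, which
		// turns the retry loop into a busy loop.
		broken := (1 << (2 * uint64(i))) * baseline

		// Fixed pattern: only recompute while still under the cap, so the
		// shift (and the attempt counter) stops growing once the cap is hit.
		if fixed < limit {
			fixed = (1 << (2 * attempts)) * baseline
			if fixed > limit {
				fixed = limit
			}
			attempts++
		}

		if broken < 0 {
			fmt.Printf("iteration %d: broken=%v fixed=%v\n", i, broken, fixed)
			break
		}
	}
}
```

In the diffs below, the fix takes the form of an `if backoff < limit { ... }` guard wrapped around the existing shift-and-cap logic, and in most call sites the attempt counter now only increments inside that guard.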

Fixes: #18199
tgross committed Aug 14, 2023
1 parent a170499 commit c326e0a
Showing 10 changed files with 73 additions and 44 deletions.
3 changes: 3 additions & 0 deletions .changelog/18200.txt
@@ -0,0 +1,3 @@
```release-note:bug
core: Fixed a bug where exponential backoff could result in excessive CPU usage
```
11 changes: 7 additions & 4 deletions client/allocrunner/taskrunner/stats_hook.go
@@ -128,6 +128,8 @@ MAIN:
// It logs the errors with appropriate log levels; don't log returned error
func (h *statsHook) callStatsWithRetry(ctx context.Context, handle interfaces.DriverStats) (<-chan *cstructs.TaskResourceUsage, error) {
var retry int
backoff := time.Duration(0)
limit := time.Second * 5

MAIN:
if ctx.Err() != nil {
@@ -162,10 +164,11 @@ MAIN:
h.logger.Error("failed to start stats collection for task", "error", err)
}

limit := time.Second * 5
backoff := 1 << (2 * uint64(retry)) * time.Second
if backoff > limit || retry > 5 {
backoff = limit
if backoff < limit {
backoff = 1 << (2 * uint64(retry)) * time.Second
if backoff > limit || retry > 5 {
backoff = limit
}
}

// Increment retry counter
12 changes: 7 additions & 5 deletions client/allocrunner/taskrunner/vault_hook.go
@@ -312,6 +312,7 @@ OUTER:
// returns the Vault token and whether the manager should exit.
func (h *vaultHook) deriveVaultToken() (token string, exit bool) {
attempts := 0
backoff := time.Duration(0)
for {
tokens, err := h.client.DeriveToken(h.alloc, []string{h.taskName})
if err == nil {
@@ -339,14 +340,15 @@ func (h *vaultHook) deriveVaultToken() (token string, exit bool) {
}

// Handle the retry case
backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
if backoff > vaultBackoffLimit {
backoff = vaultBackoffLimit
if backoff < vaultBackoffLimit {
backoff = (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
if backoff > vaultBackoffLimit {
backoff = vaultBackoffLimit
}
attempts++
}
h.logger.Error("failed to derive Vault token", "error", err, "recoverable", true, "backoff", backoff)

attempts++

// Wait till retrying
select {
case <-h.ctx.Done():
13 changes: 8 additions & 5 deletions client/devicemanager/instance.go
@@ -470,6 +470,7 @@ START:

var sresp *device.StatsResponse
var ok bool
backoff := time.Duration(0)
for {
select {
case <-i.ctx.Done():
@@ -494,12 +495,14 @@ START:
goto START
}

// Retry with an exponential backoff
backoff := (1 << (2 * uint64(attempt))) * statsBackoffBaseline
if backoff > statsBackoffLimit {
backoff = statsBackoffLimit
// Retry with an exponential backoff, avoiding overflow
if backoff < statsBackoffLimit {
backoff = (1 << (2 * uint64(attempt))) * statsBackoffBaseline
if backoff > statsBackoffLimit {
backoff = statsBackoffLimit
}
attempt++
}
attempt++

i.logger.Error("stats returned an error", "error", err, "retry", backoff)

21 changes: 12 additions & 9 deletions client/pluginmanager/drivermanager/instance.go
@@ -329,12 +329,13 @@ func (i *instanceManager) fingerprint() {
i.handleFingerprintError()

// Calculate the new backoff
backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
if backoff > driverFPBackoffLimit {
backoff = driverFPBackoffLimit
if backoff < driverFPBackoffLimit {
backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
if backoff > driverFPBackoffLimit {
backoff = driverFPBackoffLimit
}
retry++
}
// Increment retry counter
retry++
continue
}
cancel()
@@ -453,11 +454,13 @@ func (i *instanceManager) handleEvents() {
i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry)

// Calculate the new backoff
backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
if backoff > driverFPBackoffLimit {
backoff = driverFPBackoffLimit
if backoff < driverFPBackoffLimit {
backoff = (1 << (2 * uint64(retry))) * driverFPBackoffBaseline
if backoff > driverFPBackoffLimit {
backoff = driverFPBackoffLimit
}
retry++
}
retry++
continue
}
cancel()
13 changes: 8 additions & 5 deletions command/agent/consul/version_checker.go
@@ -27,6 +27,7 @@ func checkConsulTLSSkipVerify(ctx context.Context, logger log.Logger, client Age

timer, stop := helper.NewSafeTimer(limit)
defer stop()
backoff := time.Duration(0)

for {
self, err := client.Self()
@@ -40,11 +41,13 @@
return
}

backoff := (1 << (2 * i)) * baseline
if backoff > limit {
backoff = limit
} else {
i++
if backoff < limit {
backoff = (1 << (2 * i)) * baseline
if backoff > limit {
backoff = limit
} else {
i++
}
}

timer.Reset(backoff)
3 changes: 2 additions & 1 deletion drivers/docker/driver.go
@@ -572,7 +572,8 @@ START:
return recoverableErrTimeouts(startErr)
}

// nextBackoff returns appropriate docker backoff durations after attempted attempts.
// nextBackoff returns appropriate docker backoff durations after attempted
// attempts. Note that the caller must cap attempted so that it doesn't overflow
func nextBackoff(attempted int) time.Duration {
// attempts in 200ms, 800ms, 3.2s, 12.8s, 51.2s
// TODO: add randomization factor and extract to a helper
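
The `nextBackoff` comment above relies on the caller to bound `attempted` rather than capping inside the function. A minimal sketch of that shape follows; the 50ms baseline is an assumption chosen only because it reproduces the documented 200ms, 800ms, 3.2s, 12.8s, 51.2s progression, and the actual return expression is not shown in this diff.

```go
package docker

import "time"

// nextBackoffSketch is a sketch only, not the actual driver code.
func nextBackoffSketch(attempted int) time.Duration {
	// Assumed baseline: attempts 1..5 yield 200ms, 800ms, 3.2s, 12.8s, 51.2s.
	const baseline = 50 * time.Millisecond
	// There is no internal cap: (1 << (2*attempted)) * baseline wraps into a
	// negative Duration once the shift grows large enough, which is why the
	// caller must bound attempted (typically by giving up and returning an
	// error after a few attempts) before calling this.
	return (1 << (2 * uint64(attempted))) * baseline
}
```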
11 changes: 6 additions & 5 deletions drivers/docker/stats.go
@@ -137,12 +137,13 @@ func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, inte
h.logger.Debug("error collecting stats from container", "error", err)

// Calculate the new backoff
backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline
if backoff > statsCollectorBackoffLimit {
backoff = statsCollectorBackoffLimit
if backoff < statsCollectorBackoffLimit {
backoff = (1 << (2 * uint64(retry))) * statsCollectorBackoffBaseline
if backoff > statsCollectorBackoffLimit {
backoff = statsCollectorBackoffLimit
}
retry++
}
// Increment retry counter
retry++
continue
}
// Stats finished either because context was canceled, doneCh was closed
9 changes: 6 additions & 3 deletions nomad/state/state_store.go
@@ -256,6 +256,7 @@ func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*State
const backoffLimit = 1 * time.Second
var retries uint
var retryTimer *time.Timer
deadline := time.Duration(0)

// XXX: Potential optimization is to set up a watch on the state
// store's index table and only unblock via a trigger rather than
@@ -279,9 +280,11 @@ func (s *StateStore) SnapshotMinIndex(ctx context.Context, index uint64) (*State
retryTimer = time.NewTimer(backoffBase)
} else {
// Subsequent retry, reset timer
deadline := 1 << (2 * retries) * backoffBase
if deadline > backoffLimit {
deadline = backoffLimit
if deadline < backoffLimit {
deadline = 1 << (2 * retries) * backoffBase
if deadline > backoffLimit {
deadline = backoffLimit
}
}
retryTimer.Reset(deadline)
}
21 changes: 14 additions & 7 deletions nomad/worker.go
@@ -109,8 +109,9 @@ type Worker struct {

// failures is the count of errors encountered while dequeueing evaluations
// and is used to calculate backoff.
failures uint
evalToken string
failures uint
failureBackoff time.Duration
evalToken string

// snapshotIndex is the index of the snapshot in which the scheduler was
// first invoked. It is used to mark the SnapshotIndex of evaluations
@@ -133,6 +134,7 @@ func newWorker(ctx context.Context, srv *Server, args SchedulerWorkerPoolArgs) *
start: time.Now(),
status: WorkerStarting,
enabledSchedulers: make([]string, len(args.EnabledSchedulers)),
failureBackoff: time.Duration(0),
}
copy(w.enabledSchedulers, args.EnabledSchedulers)

@@ -874,12 +876,17 @@ func (w *Worker) shouldResubmit(err error) bool {
// backoff if the server or the worker is shutdown.
func (w *Worker) backoffErr(base, limit time.Duration) bool {
w.setWorkloadStatus(WorkloadBackoff)
backoff := (1 << (2 * w.failures)) * base
if backoff > limit {
backoff = limit
} else {
w.failures++

backoff := w.failureBackoff
if w.failureBackoff < limit {
backoff = (1 << (2 * w.failures)) * base
if backoff > limit {
backoff = limit
} else {
w.failures++
}
}

select {
case <-time.After(backoff):
return false
