Skip to content

Commit

Permalink
Improve fleet controller response times in busy clusters. (#1108)
Browse files Browse the repository at this point in the history
In a cluster with a significant number of fleets (more fleets than workers in the Fleet
controller) and many configuration updates in flight,
it is possible for change notifications to be queued with
ever-increasing exponential back-off (up to 1000 seconds).

Because fleet controller listens for both user-triggered changes and
GSS changes, it can get really busy when allocations are in flight,
leading to slower and slower fleet reaction times to both scaling and
reflecting GSS status.

When this happens and a user script scales a fleet,
it can take multiple minutes for Fleet controller to respond.

This change caps the queueing time for each fleet at 3 seconds to avoid
spinning CPU cycles for really busy GSSs, but still give reasonable
latency for user-triggered changes.

Fixes #1107
  • Loading branch information
jkowalski committed Oct 14, 2019
1 parent 0c0ab85 commit e633559
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pkg/fleetautoscalers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func NewController(
fleetAutoscalerSynced: autoscaler.Informer().HasSynced,
}
c.baseLogger = runtime.NewLoggerWithType(c)
c.workerqueue = workerqueue.NewWorkerQueue(c.syncFleetAutoscaler, c.baseLogger, logfields.FleetAutoscalerKey, autoscaling.GroupName+".FleetAutoscalerController")
c.workerqueue = workerqueue.NewWorkerQueueWithRateLimiter(c.syncFleetAutoscaler, c.baseLogger, logfields.FleetAutoscalerKey, autoscaling.GroupName+".FleetAutoscalerController", workerqueue.FastRateLimiter(3*time.Second))
health.AddLivenessCheck("fleetautoscaler-workerqueue", healthcheck.Check(c.workerqueue.Healthy))

eventBroadcaster := record.NewBroadcaster()
Expand Down
3 changes: 2 additions & 1 deletion pkg/fleets/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"encoding/json"
"fmt"
"reflect"
"time"

"agones.dev/agones/pkg/apis/agones"
agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
Expand Down Expand Up @@ -89,7 +90,7 @@ func NewController(
}

c.baseLogger = runtime.NewLoggerWithType(c)
c.workerqueue = workerqueue.NewWorkerQueue(c.syncFleet, c.baseLogger, logfields.FleetKey, agones.GroupName+".FleetController")
c.workerqueue = workerqueue.NewWorkerQueueWithRateLimiter(c.syncFleet, c.baseLogger, logfields.FleetKey, agones.GroupName+".FleetController", workerqueue.FastRateLimiter(3*time.Second))
health.AddLivenessCheck("fleet-workerqueue", healthcheck.Check(c.workerqueue.Healthy))

eventBroadcaster := record.NewBroadcaster()
Expand Down
3 changes: 2 additions & 1 deletion pkg/gameserversets/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package gameserversets
import (
"encoding/json"
"sync"
"time"

"agones.dev/agones/pkg/apis"
"agones.dev/agones/pkg/apis/agones"
Expand Down Expand Up @@ -109,7 +110,7 @@ func NewController(
}

c.baseLogger = runtime.NewLoggerWithType(c)
c.workerqueue = workerqueue.NewWorkerQueue(c.syncGameServerSet, c.baseLogger, logfields.GameServerSetKey, agones.GroupName+".GameServerSetController")
c.workerqueue = workerqueue.NewWorkerQueueWithRateLimiter(c.syncGameServerSet, c.baseLogger, logfields.GameServerSetKey, agones.GroupName+".GameServerSetController", workerqueue.FastRateLimiter(3*time.Second))
health.AddLivenessCheck("gameserverset-workerqueue", healthcheck.Check(c.workerqueue.Healthy))

eventBroadcaster := record.NewBroadcaster()
Expand Down
8 changes: 8 additions & 0 deletions pkg/util/workerqueue/workerqueue.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ type WorkerQueue struct {
running int
}

// FastRateLimiter returns a rate limiter without exponential back-off, with specified maximum per-item retry delay.
// FastRateLimiter returns a rate limiter that retries an item quickly a
// fixed number of times (no exponential back-off) and afterwards waits
// maxDelay between each subsequent retry of that item.
func FastRateLimiter(maxDelay time.Duration) workqueue.RateLimiter {
	const (
		fastRetryCount = 5                      // number of retries served at the fast interval
		fastInterval   = 200 * time.Millisecond // delay applied to the first fastRetryCount retries
	)
	return workqueue.NewItemFastSlowRateLimiter(fastInterval, maxDelay, fastRetryCount)
}

// NewWorkerQueue returns a new worker queue for a given name
func NewWorkerQueue(handler Handler, logger *logrus.Entry, keyName logfields.ResourceType, queueName string) *WorkerQueue {
return NewWorkerQueueWithRateLimiter(handler, logger, keyName, queueName, workqueue.DefaultControllerRateLimiter())
Expand Down

0 comments on commit e633559

Please sign in to comment.