Revert #3070, wait on networking a different way
The problem addressed by #3070 is that, on an indeterminate basis, we
see containers start without networking fully available. Once
networking starts working, it keeps working.

However, the fix in #3070 introduced a downside: heavy watch traffic,
because I didn't quite understand that the client-wide timeout would
also cut off the watch's hanging GET. See #3106.

Instead of timing out the whole client, use an initial-probe approach:
block on a successful GET (with a reasonable timeout) before starting
the informers.

Fixes #3106
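
As a rough, minimal sketch of the initial-probe idea, assuming hypothetical names (probe, waitForAPI, flaky) rather than the ones in the actual change below: retry a cheap request with a short per-attempt deadline, and only start the informers once one attempt succeeds.

// Sketch only; names here are illustrative, not part of the real change.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// probe stands in for one cheap API call, e.g. a LIST scoped to a single
// object via a field selector.
type probe func(ctx context.Context) error

// waitForAPI retries the probe, giving each attempt its own 3s deadline,
// until an attempt succeeds or the outer context is cancelled.
func waitForAPI(ctx context.Context, p probe) error {
	for try := 0; ctx.Err() == nil; try++ {
		attemptCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
		err := p(attemptCtx)
		cancel()
		if err == nil {
			fmt.Printf("connected after %d attempt(s)\n", try+1)
			return nil
		}
		fmt.Printf("attempt %d failed: %v\n", try+1, err)
	}
	return ctx.Err()
}

func main() {
	// Simulate a network that only comes up after a couple of failed attempts.
	attempts := 0
	flaky := func(ctx context.Context) error {
		attempts++
		if attempts < 3 {
			return errors.New("connection refused")
		}
		return nil
	}
	if err := waitForAPI(context.Background(), flaky); err != nil {
		fmt.Println("giving up:", err)
		return
	}
	// Only now would the informers be started and the cache sync awaited.
}

Because each attempt gets its own deadline, a hung initial request fails fast and is retried, while the watch's long-lived GET is no longer subject to a client-wide timeout.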
zmerlynn committed Apr 17, 2023
1 parent 18d1e8f commit c40fe48
Showing 2 changed files with 30 additions and 7 deletions.
cmd/sdk-server/main.go: 0 additions, 7 deletions
@@ -122,13 +122,6 @@ func main() {
 	if err != nil {
 		logger.WithError(err).Fatal("Could not create in cluster config")
 	}
-	// The SDK client only ever accesses small amounts of data (single object list /
-	// event updates), latency more than a couple of seconds is excessive. We need to
-	// keep a relatively tight timeout during initialization as well to allow the
-	// informer a chance to retry - the SDK won't reply to /healthz checks until the
-	// informer has synced once, and our liveness configuration only allows 9s before
-	// a positive /healthz.
-	config.Timeout = 3 * time.Second
 
 	var kubeClient *kubernetes.Clientset
 	kubeClient, err = kubernetes.NewForConfig(config)
pkg/sdkserver/sdkserver.go: 30 additions, 0 deletions
@@ -190,6 +190,9 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf
 // Run processes the rate limited queue.
 // Will block until stop is closed
 func (s *SDKServer) Run(ctx context.Context) error {
+	if err := s.waitForConnection(ctx); err != nil {
+		return err
+	}
 	s.informerFactory.Start(ctx.Done())
 	if !cache.WaitForCacheSync(ctx.Done(), s.gameServerSynced) {
 		return errors.New("failed to wait for caches to sync")
@@ -263,6 +266,33 @@ func (s *SDKServer) Run(ctx context.Context) error {
 	return nil
 }
 
+// waitForConnection attempts a GameServer GET every 3s until the client responds.
+// This is a workaround for the informer hanging indefinitely on first LIST due
+// to a flaky network to the Kubernetes service endpoint.
+func (s *SDKServer) waitForConnection(ctx context.Context) error {
+	getWithDeadline := func(ctx context.Context) error {
+		ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+		defer cancel()
+
+		// Specifically use gameServerGetter since it's the raw client (gameServerLister is the informer).
+		// We use List here to avoid needing permission to Get().
+		_, err := s.gameServerGetter.GameServers(s.namespace).List(ctx, metav1.ListOptions{
+			FieldSelector: fields.OneTermEqualSelector("metadata.name", s.gameServerName).String(),
+		})
+		return err
+	}
+
+	for try := 0; ctx.Err() == nil; try++ {
+		err := getWithDeadline(ctx)
+		if err == nil {
+			s.logger.WithField("try", try).Info("Kubernetes connection established")
+			return nil
+		}
+		s.logger.WithField("try", try).WithError(err).Info("Kubernetes connection failed")
+	}
+	return ctx.Err()
+}
+
 // syncGameServer synchronises the GameServer with the requested operations.
 // The format of the key is {operation}. To prevent old operation data from
 // overwriting the new one, the operation data is persisted in SDKServer.
