From c40fe480fc1f05365f19ded2ef3b2a8e9f860e35 Mon Sep 17 00:00:00 2001
From: Zach Loafman
Date: Mon, 17 Apr 2023 22:16:04 +0000
Subject: [PATCH] Revert #3070, wait on networking a different way

The problem addressed by #3070 is that, on an indeterminate basis, we
are seeing containers start before networking is fully available. Once
networking starts working, it continues to work fine.

However, the fix in #3070 introduced a downside: heavy watch traffic,
because I didn't quite understand that the client-wide timeout would
also time out the hanging GET of the watch. See #3106.

Instead of timing out the whole client, let's use an initial-probe
approach: block on a successful GET (with a reasonable per-attempt
timeout) before we try to start informers.

Fixes #3106
---
 cmd/sdk-server/main.go     |  7 -------
 pkg/sdkserver/sdkserver.go | 30 ++++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/cmd/sdk-server/main.go b/cmd/sdk-server/main.go
index 8627a1221b..9a54646dbd 100644
--- a/cmd/sdk-server/main.go
+++ b/cmd/sdk-server/main.go
@@ -122,13 +122,6 @@ func main() {
 	if err != nil {
 		logger.WithError(err).Fatal("Could not create in cluster config")
 	}
-	// The SDK client only ever accesses small amounts of data (single object list /
-	// event updates), latency more than a couple of seconds is excessive. We need to
-	// keep a relatively tight timeout during initialization as well to allow the
-	// informer a chance to retry - the SDK won't reply to /healthz checks until the
-	// informer has synced once, and our liveness configuration only allows 9s before
-	// a positive /healthz.
-	config.Timeout = 3 * time.Second
 
 	var kubeClient *kubernetes.Clientset
 	kubeClient, err = kubernetes.NewForConfig(config)
diff --git a/pkg/sdkserver/sdkserver.go b/pkg/sdkserver/sdkserver.go
index b751b6033b..840c9fe880 100644
--- a/pkg/sdkserver/sdkserver.go
+++ b/pkg/sdkserver/sdkserver.go
@@ -190,6 +190,9 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf
 // Run processes the rate limited queue.
 // Will block until stop is closed
 func (s *SDKServer) Run(ctx context.Context) error {
+	if err := s.waitForConnection(ctx); err != nil {
+		return err
+	}
 	s.informerFactory.Start(ctx.Done())
 	if !cache.WaitForCacheSync(ctx.Done(), s.gameServerSynced) {
 		return errors.New("failed to wait for caches to sync")
@@ -263,6 +266,33 @@ func (s *SDKServer) Run(ctx context.Context) error {
 	return nil
 }
 
+// waitForConnection attempts a GameServer List (with a 3s timeout per attempt) until
+// the request succeeds. This is a workaround for the informer hanging indefinitely on
+// its first LIST due to a flaky network to the Kubernetes service endpoint.
+func (s *SDKServer) waitForConnection(ctx context.Context) error {
+	getWithDeadline := func(ctx context.Context) error {
+		ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+		defer cancel()
+
+		// Specifically use gameServerGetter since it's the raw client (gameServerLister is the informer).
+		// We use List here to avoid needing permission to Get().
+		_, err := s.gameServerGetter.GameServers(s.namespace).List(ctx, metav1.ListOptions{
+			FieldSelector: fields.OneTermEqualSelector("metadata.name", s.gameServerName).String(),
+		})
+		return err
+	}
+
+	for try := 0; ctx.Err() == nil; try++ {
+		err := getWithDeadline(ctx)
+		if err == nil {
+			s.logger.WithField("try", try).Info("Kubernetes connection established")
+			return nil
+		}
+		s.logger.WithField("try", try).WithError(err).Info("Kubernetes connection failed")
+	}
+	return ctx.Err()
+}
+
 // syncGameServer synchronises the GameServer with the requested operations.
 // The format of the key is {operation}. To prevent old operation data from
 // overwriting the new one, the operation data is persisted in SDKServer.
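
Note (reviewer sketch, not part of the patch): the initial-probe pattern above can be
illustrated standalone with client-go. The program below is a minimal sketch under
assumptions: the names waitForAPI and podName are hypothetical, the probe lists a Pod
rather than a GameServer, and the one-second pause between attempts is an addition in
the sketch (the patch retries immediately, relying only on the 3s per-attempt deadline).

package main

import (
	"context"
	"fmt"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// waitForAPI retries a narrowly scoped List (3s deadline per attempt) until the API
// server answers or ctx is cancelled, mirroring waitForConnection in the patch.
func waitForAPI(ctx context.Context, client kubernetes.Interface, namespace, podName string) error {
	probe := func(ctx context.Context) error {
		ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
		defer cancel()
		// List a single object by field selector; this only needs list permission.
		_, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
			FieldSelector: fields.OneTermEqualSelector("metadata.name", podName).String(),
		})
		return err
	}

	for try := 0; ctx.Err() == nil; try++ {
		err := probe(ctx)
		if err == nil {
			fmt.Printf("API reachable after %d retries\n", try)
			return nil
		}
		fmt.Printf("probe %d failed: %v\n", try, err)
		// Pause between attempts; an addition in this sketch, not in the patch.
		time.Sleep(time.Second)
	}
	return ctx.Err()
}

func main() {
	config, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}
	if err := waitForAPI(context.Background(), client, "default", "my-pod"); err != nil {
		panic(err)
	}
	// Only start informers / the rest of startup once the probe has succeeded.
}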