-
Notifications
You must be signed in to change notification settings - Fork 828
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix for Pod deletion during unavailable controller (#1279)
* Fix for Pod deletion during unavailable controller. If a Pod gets deleted, especially while its GameServer is in the Ready or Allocated state, and the controller has crashed, is missing, or is unable to access the master, then when the controller comes back up the GameServer is left in a zombie state: it could be Allocated, but there is no Pod process to back it. Ideally, scenarios like this shouldn't happen, but they are possible, depending on user interaction with Kubernetes, so we should cover the scenario, as it otherwise requires manual intervention to fix. This PR implements a controller that periodically checks GameServers to ensure they have backing Pods, so that if this happens the GameServer is marked as Unhealthy, and a Fleet can eventually return to a healed, stable state without requiring manual intervention. Closes #1170. Closes #398 (especially combined with the fix for #1245).
- Loading branch information
1 parent
5319b13
commit bb05ab0
Showing
7 changed files
with
524 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
// Copyright 2020 Google LLC All Rights Reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package gameservers | ||
|
||
import ( | ||
"agones.dev/agones/pkg/apis/agones" | ||
agonesv1 "agones.dev/agones/pkg/apis/agones/v1" | ||
"agones.dev/agones/pkg/client/clientset/versioned" | ||
"agones.dev/agones/pkg/client/clientset/versioned/scheme" | ||
getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" | ||
"agones.dev/agones/pkg/client/informers/externalversions" | ||
listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" | ||
"agones.dev/agones/pkg/util/logfields" | ||
"agones.dev/agones/pkg/util/runtime" | ||
"agones.dev/agones/pkg/util/workerqueue" | ||
"github.com/heptiolabs/healthcheck" | ||
"github.com/pkg/errors" | ||
"github.com/sirupsen/logrus" | ||
corev1 "k8s.io/api/core/v1" | ||
k8serrors "k8s.io/apimachinery/pkg/api/errors" | ||
"k8s.io/client-go/informers" | ||
"k8s.io/client-go/kubernetes" | ||
typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" | ||
corelisterv1 "k8s.io/client-go/listers/core/v1" | ||
"k8s.io/client-go/tools/cache" | ||
"k8s.io/client-go/tools/record" | ||
) | ||
|
||
// MissingPodController makes sure that any GameServer | ||
// that isn't in a Scheduled or Unhealthy state and is missing a Pod is | ||
// moved to Unhealthy. | ||
// | ||
// It's possible that a GameServer is missing its associated pod due to | ||
// unexpected controller downtime or if the Pod is deleted with no subsequent Delete event. | ||
// | ||
// Since resync on the controller is every 30 seconds, even if there is some time in which a GameServer | ||
// is in a broken state, it will eventually move to Unhealthy, and get replaced (if in a Fleet). | ||
type MissingPodController struct { | ||
baseLogger *logrus.Entry | ||
podSynced cache.InformerSynced | ||
podLister corelisterv1.PodLister | ||
gameServerSynced cache.InformerSynced | ||
gameServerGetter getterv1.GameServersGetter | ||
gameServerLister listerv1.GameServerLister | ||
workerqueue *workerqueue.WorkerQueue | ||
recorder record.EventRecorder | ||
} | ||
|
||
// NewMissingPodController returns a MissingPodController | ||
func NewMissingPodController(health healthcheck.Handler, | ||
kubeClient kubernetes.Interface, | ||
agonesClient versioned.Interface, | ||
kubeInformerFactory informers.SharedInformerFactory, | ||
agonesInformerFactory externalversions.SharedInformerFactory) *MissingPodController { | ||
podInformer := kubeInformerFactory.Core().V1().Pods().Informer() | ||
gameServers := agonesInformerFactory.Agones().V1().GameServers() | ||
|
||
c := &MissingPodController{ | ||
podSynced: podInformer.HasSynced, | ||
podLister: kubeInformerFactory.Core().V1().Pods().Lister(), | ||
gameServerSynced: gameServers.Informer().HasSynced, | ||
gameServerGetter: agonesClient.AgonesV1(), | ||
gameServerLister: gameServers.Lister(), | ||
} | ||
|
||
c.baseLogger = runtime.NewLoggerWithType(c) | ||
c.workerqueue = workerqueue.NewWorkerQueue(c.syncGameServer, c.baseLogger, logfields.GameServerKey, agones.GroupName+".MissingPodController") | ||
health.AddLivenessCheck("gameserver-missing-pod-workerqueue", healthcheck.Check(c.workerqueue.Healthy)) | ||
|
||
eventBroadcaster := record.NewBroadcaster() | ||
eventBroadcaster.StartLogging(c.baseLogger.Debugf) | ||
eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) | ||
c.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "missing-pod-controller"}) | ||
|
||
gameServers.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||
UpdateFunc: func(_, newObj interface{}) { | ||
gs := newObj.(*agonesv1.GameServer) | ||
if _, isDev := gs.GetDevAddress(); !isDev && !isBeforePodCreated(gs) && !gs.IsBeingDeleted() && | ||
!(gs.Status.State == agonesv1.GameServerStateUnhealthy) { | ||
c.workerqueue.Enqueue(gs) | ||
} | ||
}, | ||
}) | ||
|
||
return c | ||
} | ||
|
||
// Run processes the rate limited queue. | ||
// Will block until stop is closed | ||
func (c *MissingPodController) Run(stop <-chan struct{}) error { | ||
c.baseLogger.Debug("Wait for cache sync") | ||
if !cache.WaitForCacheSync(stop, c.gameServerSynced, c.podSynced) { | ||
return errors.New("failed to wait for caches to sync") | ||
} | ||
|
||
c.workerqueue.Run(1, stop) | ||
return nil | ||
} | ||
|
||
func (c *MissingPodController) loggerForGameServerKey(key string) *logrus.Entry { | ||
return logfields.AugmentLogEntry(c.baseLogger, logfields.GameServerKey, key) | ||
} | ||
|
||
// syncGameServer checks if a GameServer has a backing Pod, and if not, | ||
// moves it to Unhealthy | ||
func (c *MissingPodController) syncGameServer(key string) error { | ||
namespace, name, err := cache.SplitMetaNamespaceKey(key) | ||
if err != nil { | ||
// don't return an error, as we don't want this retried | ||
runtime.HandleError(c.loggerForGameServerKey(key), errors.Wrapf(err, "invalid resource key")) | ||
return nil | ||
} | ||
|
||
// check if the pod exists | ||
if pod, err := c.podLister.Pods(namespace).Get(name); err != nil { | ||
if !k8serrors.IsNotFound(err) { | ||
return errors.Wrapf(err, "error retrieving Pod %s from namespace %s", name, namespace) | ||
} | ||
} else if isGameServerPod(pod) { | ||
// if the pod exists, all is well, and we can continue on our merry way. | ||
return nil | ||
} | ||
c.loggerForGameServerKey(key).Debug("Pod is missing. Moving GameServer to Unhealthy.") | ||
|
||
gs, err := c.gameServerLister.GameServers(namespace).Get(name) | ||
if err != nil { | ||
if k8serrors.IsNotFound(err) { | ||
c.loggerForGameServerKey(key).Debug("GameServer is no longer available for syncing") | ||
return nil | ||
} | ||
return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace) | ||
} | ||
|
||
// already on the way out, so no need to do anything. | ||
if gs.IsBeingDeleted() || gs.Status.State == agonesv1.GameServerStateUnhealthy { | ||
c.loggerForGameServerKey(key).WithField("state", gs.Status.State).Debug("GameServer already being deleted/unhealthy. Skipping.") | ||
return nil | ||
} | ||
|
||
gsCopy := gs.DeepCopy() | ||
gsCopy.Status.State = agonesv1.GameServerStateUnhealthy | ||
gs, err = c.gameServerGetter.GameServers(gsCopy.ObjectMeta.Namespace).Update(gsCopy) | ||
if err != nil { | ||
return errors.Wrap(err, "error updating GameServer to Unhealthy") | ||
} | ||
|
||
c.recorder.Event(gs, corev1.EventTypeWarning, string(gs.Status.State), "Pod is missing") | ||
return nil | ||
} |
Oops, something went wrong.