From 243a869079f66032f700d96159a7bc8b5e2015f8 Mon Sep 17 00:00:00 2001 From: Mark Mandel Date: Wed, 7 Nov 2018 11:45:55 -0800 Subject: [PATCH] Mark GameServer Unhealthy if allocated HostPort isn't available This marks a GameServer as unhealthy is the allocated port isn't available on a given Node for whatever reason. Since it's marked as Unhealthy, the Fleet controller will delete it, and the new GameServer will be allocated a new random port that it deems open and available. I've yet to see this in the wild, but I had mild concerns, so figured I'd fix it now rather than later. --- pkg/gameservers/health.go | 49 ++++++++++++++++++++++++---- pkg/gameservers/health_test.go | 59 ++++++++++++++++++++++++++++++++-- test/e2e/gameserver_test.go | 26 +++++++++++++++ 3 files changed, 124 insertions(+), 10 deletions(-) diff --git a/pkg/gameservers/health.go b/pkg/gameservers/health.go index 451ffc4ad8..2affbc83e5 100644 --- a/pkg/gameservers/health.go +++ b/pkg/gameservers/health.go @@ -15,6 +15,8 @@ package gameservers import ( + "strings" + "agones.dev/agones/pkg/apis/stable" "agones.dev/agones/pkg/apis/stable/v1alpha1" "agones.dev/agones/pkg/client/clientset/versioned" @@ -39,8 +41,8 @@ import ( ) // HealthController watches Pods, and applies -// an Unhealthy state if the GameServer main container exits when in -// a Ready state +// an Unhealthy state if certain pods crash, or can't be assigned a port, and other +// similar type conditions. type HealthController struct { logger *logrus.Entry podSynced cache.InformerSynced @@ -75,9 +77,8 @@ func NewHealthController(kubeClient kubernetes.Interface, agonesClient versioned UpdateFunc: func(oldObj, newObj interface{}) { pod := newObj.(*corev1.Pod) if owner := metav1.GetControllerOf(pod); owner != nil && owner.Kind == "GameServer" { - if v1alpha1.GameServerRolePodSelector.Matches(labels.Set(pod.Labels)) && hc.failedContainer(pod) { + if v1alpha1.GameServerRolePodSelector.Matches(labels.Set(pod.Labels)) && hc.isUnhealthy(pod) { key := pod.ObjectMeta.Namespace + "/" + owner.Name - hc.logger.WithField("key", key).Info("GameServer container has terminated") hc.workerqueue.Enqueue(cache.ExplicitKey(key)) } } @@ -86,6 +87,25 @@ func NewHealthController(kubeClient kubernetes.Interface, agonesClient versioned return hc } +// isUnhealthy returns if the Pod event is going +// to cause the GameServer to become Unhealthy +func (hc *HealthController) isUnhealthy(pod *corev1.Pod) bool { + return hc.unschedulableWithNoFreePorts(pod) || hc.failedContainer(pod) +} + +// unschedulableWithNoFreePorts checks if the reason the Pod couldn't be scheduled +// was because there weren't any free ports in the range specified +func (hc *HealthController) unschedulableWithNoFreePorts(pod *corev1.Pod) bool { + for _, cond := range pod.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Reason == corev1.PodReasonUnschedulable { + if strings.Contains(cond.Message, "free ports") { + return true + } + } + } + return false +} + // failedContainer checks each container, and determines if there was a failed // container func (hc *HealthController) failedContainer(pod *corev1.Pod) bool { @@ -96,7 +116,6 @@ func (hc *HealthController) failedContainer(pod *corev1.Pod) bool { } } return false - } // Run processes the rate limited queue. @@ -126,7 +145,23 @@ func (hc *HealthController) syncGameServer(key string) error { return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace) } - if gs.Status.State == v1alpha1.Ready { + var reason string + unhealthy := false + + switch gs.Status.State { + + case v1alpha1.Starting: + hc.logger.WithField("key", key).Info("GameServer cannot start on this port") + unhealthy = true + reason = "No nodes have free ports for the allocated ports" + + case v1alpha1.Ready: + hc.logger.WithField("key", key).Info("GameServer container has terminated") + unhealthy = true + reason = "GameServer container terminated" + } + + if unhealthy { hc.logger.WithField("gs", gs).Infof("Marking GameServer as Unhealthy") gsCopy := gs.DeepCopy() gsCopy.Status.State = v1alpha1.Unhealthy @@ -135,7 +170,7 @@ func (hc *HealthController) syncGameServer(key string) error { return errors.Wrapf(err, "error updating GameServer %s to unhealthy", gs.ObjectMeta.Name) } - hc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), "GameServer container terminated") + hc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), reason) } return nil diff --git a/pkg/gameservers/health_test.go b/pkg/gameservers/health_test.go index 21c1ff16eb..617ca8cd81 100644 --- a/pkg/gameservers/health_test.go +++ b/pkg/gameservers/health_test.go @@ -52,6 +52,28 @@ func TestHealthControllerFailedContainer(t *testing.T) { assert.False(t, hc.failedContainer(pod2)) } +func TestHealthUnschedulableWithNoFreePorts(t *testing.T) { + t.Parallel() + + m := agtesting.NewMocks() + hc := NewHealthController(m.KubeClient, m.AgonesClient, m.KubeInformationFactory, m.AgonesInformerFactory) + + gs := v1alpha1.GameServer{ObjectMeta: v1.ObjectMeta{Name: "test"}, Spec: newSingleContainerSpec()} + gs.ApplyDefaults() + + pod, err := gs.Pod() + assert.Nil(t, err) + + pod.Status.Conditions = []corev1.PodCondition{ + {Type: corev1.PodScheduled, Reason: corev1.PodReasonUnschedulable, + Message: "0/4 nodes are available: 4 node(s) didn't have free ports for the requestedpod ports."}, + } + assert.True(t, hc.unschedulableWithNoFreePorts(pod)) + + pod.Status.Conditions[0].Message = "not a real reason" + assert.False(t, hc.unschedulableWithNoFreePorts(pod)) +} + func TestHealthControllerSyncGameServer(t *testing.T) { t.Parallel() @@ -63,11 +85,18 @@ func TestHealthControllerSyncGameServer(t *testing.T) { state v1alpha1.State expected expected }{ - "not ready": { + "started": { state: v1alpha1.Starting, + expected: expected{ + updated: true, + state: v1alpha1.Unhealthy, + }, + }, + "scheduled": { + state: v1alpha1.Scheduled, expected: expected{ updated: false, - state: v1alpha1.Starting, + state: v1alpha1.Scheduled, }, }, "ready": { @@ -83,6 +112,7 @@ func TestHealthControllerSyncGameServer(t *testing.T) { t.Run(name, func(t *testing.T) { m := agtesting.NewMocks() hc := NewHealthController(m.KubeClient, m.AgonesClient, m.KubeInformationFactory, m.AgonesInformerFactory) + hc.recorder = m.FakeRecorder gs := v1alpha1.GameServer{ObjectMeta: v1.ObjectMeta{Namespace: "default", Name: "test"}, Spec: newSingleContainerSpec(), Status: v1alpha1.GameServerStatus{State: test.state}} @@ -126,8 +156,11 @@ func TestHealthControllerRun(t *testing.T) { m.KubeClient.AddWatchReactor("pods", k8stesting.DefaultWatchReactor(podWatch, nil)) updated := make(chan bool) + defer close(updated) m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) { - defer close(updated) + defer func() { + updated <- true + }() ua := action.(k8stesting.UpdateAction) gsObj := ua.GetObject().(*v1alpha1.GameServer) assert.Equal(t, v1alpha1.Unhealthy, gsObj.Status.State) @@ -151,6 +184,26 @@ func TestHealthControllerRun(t *testing.T) { pod.Status.ContainerStatuses = []corev1.ContainerStatus{{Name: gs.Spec.Container, State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{}}}} // gate assert.True(t, hc.failedContainer(pod)) + assert.False(t, hc.unschedulableWithNoFreePorts(pod)) + + podWatch.Modify(pod) + + select { + case <-updated: + case <-time.After(10 * time.Second): + assert.FailNow(t, "timeout on GameServer update") + } + + agtesting.AssertEventContains(t, m.FakeRecorder.Events, string(v1alpha1.Unhealthy)) + + pod.Status.ContainerStatuses = nil + pod.Status.Conditions = []corev1.PodCondition{ + {Type: corev1.PodScheduled, Reason: corev1.PodReasonUnschedulable, + Message: "0/4 nodes are available: 4 node(s) didn't have free ports for the requestedpod ports."}, + } + // gate + assert.True(t, hc.unschedulableWithNoFreePorts(pod)) + assert.False(t, hc.failedContainer(pod)) podWatch.Modify(pod) diff --git a/test/e2e/gameserver_test.go b/test/e2e/gameserver_test.go index 38dcd1ea5a..891ebefacd 100644 --- a/test/e2e/gameserver_test.go +++ b/test/e2e/gameserver_test.go @@ -115,6 +115,32 @@ func TestSDKSetAnnotation(t *testing.T) { assert.NotEmpty(t, gs.ObjectMeta.Annotations["stable.agones.dev/sdk-timestamp"]) } +func TestUnhealthyGameServersWithoutFreePorts(t *testing.T) { + t.Parallel() + nodes, err := framework.KubeClient.CoreV1().Nodes().List(metav1.ListOptions{}) + assert.Nil(t, err) + + // gate + assert.True(t, len(nodes.Items) > 0) + + gs := defaultGameServer() + gs.Spec.Ports[0].HostPort = 7515 + gs.Spec.Ports[0].PortPolicy = v1alpha1.Static + + gameServers := framework.AgonesClient.StableV1alpha1().GameServers(defaultNs) + + for range nodes.Items { + _, err := gameServers.Create(gs.DeepCopy()) + assert.Nil(t, err) + } + + newGs, err := gameServers.Create(gs.DeepCopy()) + assert.Nil(t, err) + + _, err = framework.WaitForGameServerState(newGs, v1alpha1.Unhealthy, 10*time.Second) + assert.Nil(t, err) +} + func defaultGameServer() *v1alpha1.GameServer { gs := &v1alpha1.GameServer{ObjectMeta: metav1.ObjectMeta{GenerateName: "udp-server", Namespace: defaultNs}, Spec: v1alpha1.GameServerSpec{