Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mark GameServer Unhealthy if allocated HostPort isn't available #408

Merged
merged 2 commits into from
Nov 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 42 additions & 7 deletions pkg/gameservers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
package gameservers

import (
"strings"

"agones.dev/agones/pkg/apis/stable"
"agones.dev/agones/pkg/apis/stable/v1alpha1"
"agones.dev/agones/pkg/client/clientset/versioned"
Expand All @@ -39,8 +41,8 @@ import (
)

// HealthController watches Pods, and applies
// an Unhealthy state if the GameServer main container exits when in
// a Ready state
// an Unhealthy state if certain pods crash, or can't be assigned a port, and other
// similar type conditions.
type HealthController struct {
logger *logrus.Entry
podSynced cache.InformerSynced
Expand Down Expand Up @@ -75,9 +77,8 @@ func NewHealthController(kubeClient kubernetes.Interface, agonesClient versioned
UpdateFunc: func(oldObj, newObj interface{}) {
pod := newObj.(*corev1.Pod)
if owner := metav1.GetControllerOf(pod); owner != nil && owner.Kind == "GameServer" {
if v1alpha1.GameServerRolePodSelector.Matches(labels.Set(pod.Labels)) && hc.failedContainer(pod) {
if v1alpha1.GameServerRolePodSelector.Matches(labels.Set(pod.Labels)) && hc.isUnhealthy(pod) {
key := pod.ObjectMeta.Namespace + "/" + owner.Name
hc.logger.WithField("key", key).Info("GameServer container has terminated")
hc.workerqueue.Enqueue(cache.ExplicitKey(key))
}
}
Expand All @@ -86,6 +87,25 @@ func NewHealthController(kubeClient kubernetes.Interface, agonesClient versioned
return hc
}

// isUnhealthy returns if the Pod event is going
// to cause the GameServer to become Unhealthy
func (hc *HealthController) isUnhealthy(pod *corev1.Pod) bool {
return hc.unschedulableWithNoFreePorts(pod) || hc.failedContainer(pod)
}

// unschedulableWithNoFreePorts checks if the reason the Pod couldn't be scheduled
// was because there weren't any free ports in the range specified
func (hc *HealthController) unschedulableWithNoFreePorts(pod *corev1.Pod) bool {
for _, cond := range pod.Status.Conditions {
if cond.Type == corev1.PodScheduled && cond.Reason == corev1.PodReasonUnschedulable {
if strings.Contains(cond.Message, "free ports") {
return true
}
}
}
return false
}

// failedContainer checks each container, and determines if there was a failed
// container
func (hc *HealthController) failedContainer(pod *corev1.Pod) bool {
Expand All @@ -96,7 +116,6 @@ func (hc *HealthController) failedContainer(pod *corev1.Pod) bool {
}
}
return false

}

// Run processes the rate limited queue.
Expand Down Expand Up @@ -126,7 +145,23 @@ func (hc *HealthController) syncGameServer(key string) error {
return errors.Wrapf(err, "error retrieving GameServer %s from namespace %s", name, namespace)
}

if gs.Status.State == v1alpha1.Ready {
var reason string
unhealthy := false

switch gs.Status.State {

case v1alpha1.Starting:
hc.logger.WithField("key", key).Info("GameServer cannot start on this port")
unhealthy = true
reason = "No nodes have free ports for the allocated ports"

case v1alpha1.Ready:
hc.logger.WithField("key", key).Info("GameServer container has terminated")
unhealthy = true
reason = "GameServer container terminated"
}

if unhealthy {
hc.logger.WithField("gs", gs).Infof("Marking GameServer as Unhealthy")
gsCopy := gs.DeepCopy()
gsCopy.Status.State = v1alpha1.Unhealthy
Expand All @@ -135,7 +170,7 @@ func (hc *HealthController) syncGameServer(key string) error {
return errors.Wrapf(err, "error updating GameServer %s to unhealthy", gs.ObjectMeta.Name)
}

hc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), "GameServer container terminated")
hc.recorder.Event(gs, corev1.EventTypeWarning, string(gsCopy.Status.State), reason)
}

return nil
Expand Down
59 changes: 56 additions & 3 deletions pkg/gameservers/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,28 @@ func TestHealthControllerFailedContainer(t *testing.T) {
assert.False(t, hc.failedContainer(pod2))
}

func TestHealthUnschedulableWithNoFreePorts(t *testing.T) {
t.Parallel()

m := agtesting.NewMocks()
hc := NewHealthController(m.KubeClient, m.AgonesClient, m.KubeInformationFactory, m.AgonesInformerFactory)

gs := v1alpha1.GameServer{ObjectMeta: v1.ObjectMeta{Name: "test"}, Spec: newSingleContainerSpec()}
gs.ApplyDefaults()

pod, err := gs.Pod()
assert.Nil(t, err)

pod.Status.Conditions = []corev1.PodCondition{
{Type: corev1.PodScheduled, Reason: corev1.PodReasonUnschedulable,
Message: "0/4 nodes are available: 4 node(s) didn't have free ports for the requestedpod ports."},
}
assert.True(t, hc.unschedulableWithNoFreePorts(pod))

pod.Status.Conditions[0].Message = "not a real reason"
assert.False(t, hc.unschedulableWithNoFreePorts(pod))
}

func TestHealthControllerSyncGameServer(t *testing.T) {
t.Parallel()

Expand All @@ -63,11 +85,18 @@ func TestHealthControllerSyncGameServer(t *testing.T) {
state v1alpha1.State
expected expected
}{
"not ready": {
"started": {
state: v1alpha1.Starting,
expected: expected{
updated: true,
state: v1alpha1.Unhealthy,
},
},
"scheduled": {
state: v1alpha1.Scheduled,
expected: expected{
updated: false,
state: v1alpha1.Starting,
state: v1alpha1.Scheduled,
},
},
"ready": {
Expand All @@ -83,6 +112,7 @@ func TestHealthControllerSyncGameServer(t *testing.T) {
t.Run(name, func(t *testing.T) {
m := agtesting.NewMocks()
hc := NewHealthController(m.KubeClient, m.AgonesClient, m.KubeInformationFactory, m.AgonesInformerFactory)
hc.recorder = m.FakeRecorder

gs := v1alpha1.GameServer{ObjectMeta: v1.ObjectMeta{Namespace: "default", Name: "test"}, Spec: newSingleContainerSpec(),
Status: v1alpha1.GameServerStatus{State: test.state}}
Expand Down Expand Up @@ -126,8 +156,11 @@ func TestHealthControllerRun(t *testing.T) {
m.KubeClient.AddWatchReactor("pods", k8stesting.DefaultWatchReactor(podWatch, nil))

updated := make(chan bool)
defer close(updated)
m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
defer close(updated)
defer func() {
updated <- true
}()
ua := action.(k8stesting.UpdateAction)
gsObj := ua.GetObject().(*v1alpha1.GameServer)
assert.Equal(t, v1alpha1.Unhealthy, gsObj.Status.State)
Expand All @@ -151,6 +184,26 @@ func TestHealthControllerRun(t *testing.T) {
pod.Status.ContainerStatuses = []corev1.ContainerStatus{{Name: gs.Spec.Container, State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{}}}}
// gate
assert.True(t, hc.failedContainer(pod))
assert.False(t, hc.unschedulableWithNoFreePorts(pod))

podWatch.Modify(pod)

select {
case <-updated:
case <-time.After(10 * time.Second):
assert.FailNow(t, "timeout on GameServer update")
}

agtesting.AssertEventContains(t, m.FakeRecorder.Events, string(v1alpha1.Unhealthy))

pod.Status.ContainerStatuses = nil
pod.Status.Conditions = []corev1.PodCondition{
{Type: corev1.PodScheduled, Reason: corev1.PodReasonUnschedulable,
Message: "0/4 nodes are available: 4 node(s) didn't have free ports for the requestedpod ports."},
}
// gate
assert.True(t, hc.unschedulableWithNoFreePorts(pod))
assert.False(t, hc.failedContainer(pod))

podWatch.Modify(pod)

Expand Down
26 changes: 26 additions & 0 deletions test/e2e/gameserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,32 @@ func TestSDKSetAnnotation(t *testing.T) {
assert.NotEmpty(t, gs.ObjectMeta.Annotations["stable.agones.dev/sdk-timestamp"])
}

func TestUnhealthyGameServersWithoutFreePorts(t *testing.T) {
t.Parallel()
nodes, err := framework.KubeClient.CoreV1().Nodes().List(metav1.ListOptions{})
assert.Nil(t, err)

// gate
assert.True(t, len(nodes.Items) > 0)

gs := defaultGameServer()
gs.Spec.Ports[0].HostPort = 7515
gs.Spec.Ports[0].PortPolicy = v1alpha1.Static

gameServers := framework.AgonesClient.StableV1alpha1().GameServers(defaultNs)

for range nodes.Items {
_, err := gameServers.Create(gs.DeepCopy())
assert.Nil(t, err)
}

newGs, err := gameServers.Create(gs.DeepCopy())
assert.Nil(t, err)

_, err = framework.WaitForGameServerState(newGs, v1alpha1.Unhealthy, 10*time.Second)
assert.Nil(t, err)
}

func defaultGameServer() *v1alpha1.GameServer {
gs := &v1alpha1.GameServer{ObjectMeta: metav1.ObjectMeta{GenerateName: "udp-server", Namespace: defaultNs},
Spec: v1alpha1.GameServerSpec{
Expand Down