Skip to content

Commit

Permalink
Flaky: TestGameServerRestartBeforeReadyCrash (#1174)
Browse files Browse the repository at this point in the history
There was a race condition in which if a game server container crashes,
and then restarts and moves to ready, the crashed game server
container id could be the recorded container that was added as an
Annotation to the GameServer, which would is incorrect.

This fix has a check in place to ensure the game server container id
that is being added as the Annotation value, is from an actual running
container.

Closes #1173
  • Loading branch information
markmandel authored Nov 7, 2019
1 parent a05bf26 commit 72523e2
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 7 deletions.
5 changes: 5 additions & 0 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -786,6 +786,11 @@ func (c *Controller) syncGameServerRequestReadyState(gs *agonesv1.GameServer) (*
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == gs.Spec.Container {
if _, ok := gs.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]; !ok {
// check to make sure this container is actually running. If there was a recent crash, the cache may
// not yet have the newer, running container.
if cs.State.Running == nil {
return nil, errors.New("game server container is not currently running, try again")
}
gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] = cs.ContainerID
}
break
Expand Down
51 changes: 46 additions & 5 deletions pkg/gameservers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,7 @@ func TestControllerApplyGameServerAddressAndPort(t *testing.T) {

func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
t.Parallel()
nodeName := "node"
containerID := "1234"

t.Run("GameServer with ReadyRequest State", func(t *testing.T) {
Expand All @@ -927,10 +928,14 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
gsFixture := &agonesv1.GameServer{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
Spec: newSingleContainerSpec(), Status: agonesv1.GameServerStatus{State: agonesv1.GameServerStateRequestReady}}
gsFixture.ApplyDefaults()
gsFixture.Status.NodeName = "node"
gsFixture.Status.NodeName = nodeName
pod, err := gsFixture.Pod()
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{Name: gsFixture.Spec.Container, ContainerID: containerID},
{
Name: gsFixture.Spec.Container,
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
ContainerID: containerID,
},
}
assert.Nil(t, err)
gsUpdated := false
Expand Down Expand Up @@ -967,7 +972,11 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
assert.Nil(t, err)
pod.Spec.NodeName = nodeFixtureName
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{Name: gsFixture.Spec.Container, ContainerID: containerID},
{
Name: gsFixture.Spec.Container,
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
ContainerID: containerID,
},
}
gsUpdated := false

Expand Down Expand Up @@ -1011,11 +1020,15 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
gsFixture := &agonesv1.GameServer{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
Spec: newSingleContainerSpec(), Status: agonesv1.GameServerStatus{State: agonesv1.GameServerStateRequestReady}}
gsFixture.ApplyDefaults()
gsFixture.Status.NodeName = "node"
gsFixture.Status.NodeName = nodeName
gsFixture.Annotations[agonesv1.GameServerReadyContainerIDAnnotation] = "4321"
pod, err := gsFixture.Pod()
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{Name: gsFixture.Spec.Container, ContainerID: containerID},
{
Name: gsFixture.Spec.Container,
State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
ContainerID: containerID,
},
}
assert.Nil(t, err)
gsUpdated := false
Expand Down Expand Up @@ -1052,6 +1065,34 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
})
}

t.Run("GameServer whose pod is currently not in a running state, so should retry and not update", func(t *testing.T) {
c, m := newFakeController()

gsFixture := &agonesv1.GameServer{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
Spec: newSingleContainerSpec(), Status: agonesv1.GameServerStatus{State: agonesv1.GameServerStateRequestReady}}
gsFixture.ApplyDefaults()
gsFixture.Status.NodeName = nodeName
pod, err := gsFixture.Pod()
pod.Status.ContainerStatuses = []corev1.ContainerStatus{{Name: gsFixture.Spec.Container}}
assert.Nil(t, err)
gsUpdated := false

m.KubeClient.AddReactor("list", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &corev1.PodList{Items: []corev1.Pod{*pod}}, nil
})
m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
gsUpdated = true
return true, nil, nil
})

_, cancel := agtesting.StartInformers(m, c.podSynced)
defer cancel()

_, err = c.syncGameServerRequestReadyState(gsFixture)
assert.EqualError(t, err, "game server container is not currently running, try again")
assert.False(t, gsUpdated, "GameServer was updated")
})

t.Run("GameServer with non zero deletion datetime", func(t *testing.T) {
testWithNonZeroDeletionTimestamp(t, func(c *Controller, fixture *agonesv1.GameServer) (*agonesv1.GameServer, error) {
return c.syncGameServerRequestReadyState(fixture)
Expand Down
2 changes: 1 addition & 1 deletion pkg/gameservers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ func (hc *HealthController) skipUnhealthy(gs *agonesv1.GameServer) (bool, error)
return hc.failedContainer(pod), nil
}

// finally, we need to check if there a failed container happened after the gameserver was ready or before.
// finally, we need to check if the failed container happened after the gameserver was ready or before.
for _, cs := range pod.Status.ContainerStatuses {
if cs.Name == gs.Spec.Container {
if cs.State.Terminated != nil {
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/gameserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ func TestGameServerRestartBeforeReadyCrash(t *testing.T) {
return false
})
if err != nil {
assert.FailNowf(t, "Could not make GameServer Ready: %v", err.Error())
assert.FailNowf(t, "Could not make GameServer Ready", "reason: %v", err.Error())
}
// now crash, should be unhealthy, since it's after being Ready
logger.Info("crashing again, should be unhealthy")
Expand Down

0 comments on commit 72523e2

Please sign in to comment.