Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move RollingUpdateOnReady to stable #2271

Merged
merged 4 commits into from
Oct 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ steps:
#

- name: 'e2e-runner'
args: ['PlayerTracking=true&SDKWatchSendOnExecute=false&RollingUpdateOnReady=false&NodeExternalDNS=true&StateAllocationFilter=true&PlayerAllocationFilter=true&CustomFasSyncInterval=true', 'e2e-test-cluster']
args: ['PlayerTracking=true&SDKWatchSendOnExecute=false&NodeExternalDNS=true&StateAllocationFilter=true&PlayerAllocationFilter=true&CustomFasSyncInterval=true', 'e2e-test-cluster']
id: e2e-feature-gates
waitFor:
- push-images
Expand Down
52 changes: 1 addition & 51 deletions pkg/fleets/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -603,57 +603,7 @@ func (c *Controller) rollingUpdateRestFixedOnReady(ctx context.Context, fleet *a

// rollingUpdateRest applies the rolling update to the inactive GameServerSets
func (c *Controller) rollingUpdateRest(ctx context.Context, fleet *agonesv1.Fleet, active *agonesv1.GameServerSet, rest []*agonesv1.GameServerSet) error {
if runtime.FeatureEnabled(runtime.FeatureRollingUpdateOnReady) {
return c.rollingUpdateRestFixedOnReady(ctx, fleet, active, rest)
}
return c.rollingUpdateRestBeforeFixOnReady(ctx, fleet, rest)
}

func (c *Controller) rollingUpdateRestBeforeFixOnReady(ctx context.Context, fleet *agonesv1.Fleet, rest []*agonesv1.GameServerSet) error {
if len(rest) == 0 {
return nil
}

r, err := intstr.GetValueFromIntOrPercent(fleet.Spec.Strategy.RollingUpdate.MaxUnavailable, int(fleet.Spec.Replicas), true)

if err != nil {
return errors.Wrapf(err, "error parsing MaxUnavailable value: %s", fleet.ObjectMeta.Name)
}
unavailable := int32(r)

for _, gsSet := range rest {

// if the status.Replicas are less than or equal to 0, then that means we are done
// scaling this GameServerSet down, and can therefore exit/move to the next one.
if gsSet.Status.Replicas <= 0 {
continue
}

// If the Spec.Replicas does not equal the Status.Replicas for this GameServerSet, this means
// that the rolling down process is currently ongoing, and we should therefore exit so we can wait for it to finish
if gsSet.Spec.Replicas != gsSet.Status.Replicas {
break
}
gsSetCopy := gsSet.DeepCopy()
newReplicasCount := fleet.LowerBoundReplicas(gsSetCopy.Spec.Replicas - unavailable)
if gsSet.Status.ShutdownReplicas == 0 {
gsSetCopy.Spec.Replicas = newReplicasCount
c.loggerForFleet(fleet).Debug(fmt.Sprintf("Replicas in a Shutdown state - %d", gsSet.Status.ShutdownReplicas))
c.loggerForFleet(fleet).WithField("gameserverset", gsSet.ObjectMeta.Name).WithField("replicas", gsSetCopy.Spec.Replicas).
Debug("applying rolling update to inactive gameserverset")

if _, err := c.gameServerSetGetter.GameServerSets(gsSetCopy.ObjectMeta.Namespace).Update(ctx, gsSetCopy, metav1.UpdateOptions{}); err != nil {
return errors.Wrapf(err, "error updating gameserverset %s", gsSetCopy.ObjectMeta.Name)
}
c.recorder.Eventf(fleet, corev1.EventTypeNormal, "ScalingGameServerSet",
"Scaling inactive GameServerSet %s from %d to %d", gsSetCopy.ObjectMeta.Name, gsSet.Spec.Replicas, gsSetCopy.Spec.Replicas)

// let's update just one at a time, slightly slower, but a simpler solution that doesn't require us
// to make sure we don't overshoot the amount that is being shutdown at any given point and time
break
}
}
return nil
return c.rollingUpdateRestFixedOnReady(ctx, fleet, active, rest)
}

// updateFleetStatus gets the GameServerSets for this Fleet and then
Expand Down
32 changes: 3 additions & 29 deletions pkg/fleets/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,6 @@ func TestControllerSyncFleet(t *testing.T) {
})

t.Run("gameserverset with different image details", func(t *testing.T) {
utilruntime.FeatureTestMutex.Lock()
defer utilruntime.FeatureTestMutex.Unlock()
assert.NoError(t, utilruntime.ParseFeatures(string(utilruntime.FeatureRollingUpdateOnReady)+"=true"))

f := defaultFixture()
f.Spec.Strategy.Type = appsv1.RollingUpdateDeploymentStrategyType
f.Spec.Template.Spec.Ports = []agonesv1.GameServerPort{{HostPort: 5555}}
Expand Down Expand Up @@ -1006,12 +1002,7 @@ func TestControllerRollingUpdateDeploymentGSSUpdateFailedErrExpected(t *testing.
assert.EqualError(t, err, "error updating gameserverset inactive: random-err")
}

func TestFeatureRollingUpdateOnReady(t *testing.T) {
utilruntime.FeatureTestMutex.Lock()
defer utilruntime.FeatureTestMutex.Unlock()

assert.NoError(t, utilruntime.ParseFeatures(string(utilruntime.FeatureRollingUpdateOnReady)+"=true"))

func TestRollingUpdateOnReady(t *testing.T) {
type expected struct {
inactiveSpecReplicas int32
replicas int32
Expand Down Expand Up @@ -1118,11 +1109,6 @@ func TestFeatureRollingUpdateOnReady(t *testing.T) {
func TestControllerRollingUpdateDeployment(t *testing.T) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is failing:

--- FAIL: TestControllerRollingUpdateDeployment (0.11s)
    --- FAIL: TestControllerRollingUpdateDeployment/attempt_to_drive_replicas_over_the_max_surge (0.02s)
        controller_test.go:1280: 
            	Error Trace:	controller_test.go:1280
            	            				fixture.go:521
            	            				fake.go:141
            	            				fake_gameserverset.go:98
            	            				controller.go:592
            	            				controller.go:606
            	            				controller.go:415
            	            				controller_test.go:1285
            	Error:      	Not equal: 
            	            	expected: 65
            	            	actual  : 45
            	Test:       	TestControllerRollingUpdateDeployment/attempt_to_drive_replicas_over_the_max_surge
    --- FAIL: TestControllerRollingUpdateDeployment/statuses_don't_match_the_spec._nothing_should_happen (0.00s)
        controller_test.go:1280: 
            	Error Trace:	controller_test.go:1280
            	            				fixture.go:521
            	            				fake.go:141
            	            				fake_gameserverset.go:98
            	            				controller.go:492
            	            				controller.go:537
            	            				controller.go:606
            	            				controller.go:415
            	            				controller_test.go:1285
            	Error:      	Not equal: 
            	            	expected: 15
            	            	actual  : 10
            	Test:       	TestControllerRollingUpdateDeployment/statuses_don't_match_the_spec._nothing_should_happen
        controller_test.go:1280: 
            	Error Trace:	controller_test.go:1280
            	            				fixture.go:521
            	            				fake.go:141
            	            				fake_gameserverset.go:98
            	            				controller.go:592
            	            				controller.go:606
            	            				controller.go:415
            	            				controller_test.go:1285
            	Error:      	Not equal: 
            	            	expected: 15
            	            	actual  : 0
            	Test:       	TestControllerRollingUpdateDeployment/statuses_don't_match_the_spec._nothing_should_happen
        controller_test.go:1292: 
            	Error Trace:	controller_test.go:1292
            	Error:      	Not equal: 
            	            	expected: false
            	            	actual  : true
            	Test:       	TestControllerRollingUpdateDeployment/statuses_don't_match_the_spec._nothing_should_happen
    --- FAIL: TestControllerRollingUpdateDeployment/test_smalled_numbers_of_active_and_allocated (0.00s)
        controller_test.go:1280: 
            	Error Trace:	controller_test.go:1280
            	            				fixture.go:521
            	            				fake.go:141
            	            				fake_gameserverset.go:98
            	            				controller.go:592
            	            				controller.go:606
            	            				controller.go:415
            	            				controller_test.go:1285
            	Error:      	Not equal: 
            	            	expected: 3
            	            	actual  : 4
            	Test:       	TestControllerRollingUpdateDeployment/test_smalled_numbers_of_active_and_allocated

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I will rerun it locally and check the problem

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@roberthbailey I can reproduce the test failures in my env. Seems change in controller.go is straightforward. I didn't make other changes. Not sure why this test failed. Do you have any clues?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that these tests were aimed at when the feature was off assert.NoError(t, utilruntime.ParseFeatures(string(utilruntime.FeatureRollingUpdateOnReady)+"=false")), it's expected they would fail now it's enabled always.

So they would either need to be updated to the correct values, or removed entirely.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I took a quick look at these tests and it isn't immediately obvious to me what we are checking for. What test coverage are we losing if we delete the entire set of tests? Or just delete the 3 that are failing?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This tests various gameserver set and rolling update deployment scenarios, and if they go beyond max surge, remove items as appropriate, etc.

Basically does rollingUpdateDeployment do as it should.

I think it's worth converting this over before moving to stable - also to ensure an extra layer that it is doing as expected.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That makes sense to me. @Jeffwan - we have about 1 more day to try and get this in before the 1.18 release, otherwise we can ship it with 1.19.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

em. Seem it's better to change to the appropriate values rather than removing them? Let me check algorithm today and see if I can make the changes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I figure out the problem. I manually change to use c.rollingUpdateRestBeforeFixOnReady() they all pass. They do work for case utilruntime.FeatureRollingUpdateOnReady=false.

func (c *Controller) rollingUpdateRest(ctx context.Context, fleet *agonesv1.Fleet, active *agonesv1.GameServerSet, rest []*agonesv1.GameServerSet) error {
if runtime.FeatureEnabled(runtime.FeatureRollingUpdateOnReady) {
return c.rollingUpdateRestFixedOnReady(ctx, fleet, active, rest)
}
return c.rollingUpdateRestBeforeFixOnReady(ctx, fleet, rest)
}

Since we already use rollingUpdateRestFixedOnReady and remove c.rollingUpdateRestBeforeFixOnReady, these test need changes.

  1. I go though the logic and change right number in test cases "test smalled numbers of active and allocated" and "attempt to drive replicas over the max surge"

  2. I remove "statuses don't match the spec. nothing should happen", this is because update won' be called in the past c.rollingUpdateRestBeforeFixOnReady . However, this test case is not valid anymore because c.rollingUpdateRestFixedOnReady does consider active GSS.

@roberthbailey @markmandel Please have another check.

t.Parallel()

utilruntime.FeatureTestMutex.Lock()
defer utilruntime.FeatureTestMutex.Unlock()

assert.NoError(t, utilruntime.ParseFeatures(string(utilruntime.FeatureRollingUpdateOnReady)+"=false"))

type expected struct {
inactiveSpecReplicas int32
replicas int32
Expand Down Expand Up @@ -1197,23 +1183,11 @@ func TestControllerRollingUpdateDeployment(t *testing.T) {
inactiveSpecReplicas: 95,
inactiveStatusReplicas: 95,
expected: expected{
inactiveSpecReplicas: 65,
inactiveSpecReplicas: 45,
replicas: 30,
updated: true,
},
},
"statuses don't match the spec. nothing should happen": {
fleetSpecReplicas: 100,
activeSpecReplicas: 75,
activeStatusReplicas: 70,
inactiveSpecReplicas: 15,
inactiveStatusReplicas: 10,
expected: expected{
inactiveSpecReplicas: 15,
replicas: 75,
updated: false,
},
},
"test smalled numbers of active and allocated": {
fleetSpecReplicas: 5,
activeSpecReplicas: 0,
Expand All @@ -1223,7 +1197,7 @@ func TestControllerRollingUpdateDeployment(t *testing.T) {
inactiveStatusAllocationReplicas: 2,

expected: expected{
inactiveSpecReplicas: 3,
inactiveSpecReplicas: 4,
replicas: 2,
updated: true,
},
Expand Down
5 changes: 0 additions & 5 deletions pkg/util/runtime/features.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,6 @@ const (
// FeatureSDKWatchSendOnExecute is a feature flag to enable/disable immediate game server return after SDK.WatchGameServer is called
FeatureSDKWatchSendOnExecute Feature = "SDKWatchSendOnExecute"

// FeatureRollingUpdateOnReady is a feature flag to enable/disable rolling update fix of scale down, when ReadyReplicas
// count is taken into account
FeatureRollingUpdateOnReady Feature = "RollingUpdateOnReady"

// NodeExternalDNS is a feature flag to enable/disable node ExternalDNS and InternalDNS use as GameServer address
NodeExternalDNS Feature = "NodeExternalDNS"

Expand All @@ -66,7 +62,6 @@ var (
FeatureExample: true,
FeaturePlayerTracking: false,
FeatureSDKWatchSendOnExecute: true,
FeatureRollingUpdateOnReady: true,
NodeExternalDNS: false,
FeatureStateAllocationFilter: false,
FeaturePlayerAllocationFilter: false,
Expand Down
1 change: 0 additions & 1 deletion site/content/en/docs/Guides/feature-stages.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ The current set of `alpha` and `beta` feature gates are:
| Example Gate (not in use) | `Example` | Disabled | None | 0.13.0 |
| [Player Tracking]({{< ref "/docs/Guides/player-tracking.md" >}}) | `PlayerTracking` | Disabled | `Alpha` | 1.6.0 |
| [SDK Send GameServer on Watch execution]({{< ref "/docs/Guides/Client SDKs/_index.md#watchgameserverfunctiongameserver" >}}) | `SDKWatchSendOnExecute` | Enabled | `Beta` | 1.12.0 |
| Fix for RollingUpdate [Scale down](https://github.com/googleforgames/agones/issues/1625) | `RollingUpdateOnReady` | Enabled | `Beta` | 1.14.0 |
| [Utilize Node ExternalDNS](https://github.com/googleforgames/agones/issues/1921) and additional [details]({{< ref "/docs/FAQ/_index.md" >}}) | `NodeExternalDNS` | Disabled | `Alpha` | 1.12.0 |
| [Custom resync period for FleetAutoscaler](https://github.com/googleforgames/agones/issues/1955) | `CustomFasSyncInterval` | Disabled | `Alpha` | 1.17.0 |
| [GameServer state filtering on GameServerAllocations](https://github.com/googleforgames/agones/issues/1239) | `StateAllocationFilter` | Disabled | `Alpha` | 1.14.0 |
Expand Down
19 changes: 5 additions & 14 deletions test/e2e/fleet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,20 +297,14 @@ func TestFleetRollingUpdate(t *testing.T) {
maxSurge, err := intstr.GetValueFromIntOrPercent(flt.Spec.Strategy.RollingUpdate.MaxSurge, int(flt.Spec.Replicas), true)
assert.Nil(t, err)

roundUp := true
if runtime.FeatureEnabled(runtime.FeatureRollingUpdateOnReady) {
roundUp = false
}
roundUp := false
maxUnavailable, err := intstr.GetValueFromIntOrPercent(flt.Spec.Strategy.RollingUpdate.MaxUnavailable, int(flt.Spec.Replicas), roundUp)

shift := 0
if runtime.FeatureEnabled(runtime.FeatureRollingUpdateOnReady) {
if maxUnavailable == 0 {
maxUnavailable = 1
}
// This difference is inevitable, also could be seen with Deployments and ReplicaSets
shift = maxUnavailable
if maxUnavailable == 0 {
maxUnavailable = 1
}
// This difference is inevitable, also could be seen with Deployments and ReplicaSets
shift := maxUnavailable
assert.Nil(t, err)

expectedTotal := targetScale + maxSurge + maxUnavailable + shift
Expand Down Expand Up @@ -349,9 +343,6 @@ func TestFleetRollingUpdate(t *testing.T) {
}

func TestUpdateFleetReplicaAndSpec(t *testing.T) {
if !runtime.FeatureEnabled(runtime.FeatureRollingUpdateOnReady) {
t.SkipNow()
}
t.Parallel()

client := framework.AgonesClient.AgonesV1()
Expand Down