Advanced deployment: scale down old unhealthy pods first (#150)
* advanced deployment: scale down old unhealthy pods first

Signed-off-by: mingzhou.swx <mingzhou.swx@alibaba-inc.com>

* add e2e for advanced deployment scaling down old unhealthy pods first

Signed-off-by: mingzhou.swx <mingzhou.swx@alibaba-inc.com>

---------

Signed-off-by: mingzhou.swx <mingzhou.swx@alibaba-inc.com>
Co-authored-by: mingzhou.swx <mingzhou.swx@alibaba-inc.com>
veophi and mingzhou.swx committed Jul 12, 2023
1 parent e99c529 commit 72e1c0b
Showing 2 changed files with 43 additions and 2 deletions.
24 changes: 24 additions & 0 deletions pkg/controller/deployment/sync.go
@@ -302,6 +302,30 @@ func (dc *DeploymentController) scale(ctx context.Context, deployment *apps.Depl
// replica sets.
deploymentReplicasToAdd := allowedSize - allRSsReplicas

// Scale down the unhealthy replicas in old replica sets first to avoid bad cases.
// For example:
// ------------------------------------------------------------------------
// | ReplicaSet | oldRS-1        | oldRS-2          | newRS            |
// |------------|----------------|------------------|------------------|
// | Replicas   | 4 healthy Pods | 2 unhealthy Pods | 4 unhealthy Pods |
// ------------------------------------------------------------------------
// If we want to scale down these replica sets from 10 to 6, we expect to scale down oldRS-2
// from 2 to 0 first, then scale down oldRS-1 by 1 Pod and newRS by 1 Pod proportionally.
//
// We do not give the newRS's unhealthy Pods higher scale-down priority, because these
// new-revision Pods may have just been created rather than being crashed or otherwise broken.
var err error
var cleanupCount int32
if deploymentReplicasToAdd < 0 {
oldRSs, cleanupCount, err = dc.cleanupUnhealthyReplicas(ctx, oldRSs, deployment, -deploymentReplicasToAdd)
if err != nil {
return err
}
klog.V(4).Infof("Cleaned up unhealthy replicas from old RSes by %d during scaling", cleanupCount)
deploymentReplicasToAdd += cleanupCount
allRSs = deploymentutil.FilterActiveReplicaSets(append(oldRSs, newRS))
}

// The additional replicas should be distributed proportionally amongst the active
// replica sets from the larger to the smaller in size replica set. Scaling direction
// drives what happens in case we are trying to scale replica sets of the same size.
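
To make the worked example in the comment concrete, here is a minimal standalone Go sketch of the unhealthy-first cleanup pass. The `rs` struct, `cleanupUnhealthy`, and `min32` are hypothetical stand-ins for the controller's `apps.ReplicaSet` and `cleanupUnhealthyReplicas`, under the assumption that the old replica sets arrive already sorted:

```go
package main

import "fmt"

// rs is a toy stand-in for apps.ReplicaSet with only the fields the
// scale-down logic needs.
type rs struct {
	name      string
	replicas  int32 // desired replicas
	available int32 // healthy replicas
}

// cleanupUnhealthy mirrors the shape of cleanupUnhealthyReplicas: walk the
// old replica sets and remove unhealthy Pods (replicas - available) until
// totalToCleanup is exhausted, returning the updated sets and the count removed.
func cleanupUnhealthy(oldRSs []rs, totalToCleanup int32) ([]rs, int32) {
	var cleaned int32
	for i := range oldRSs {
		if cleaned >= totalToCleanup {
			break
		}
		unhealthy := oldRSs[i].replicas - oldRSs[i].available
		if unhealthy <= 0 {
			continue // this set has no unhealthy Pods to remove
		}
		n := min32(unhealthy, totalToCleanup-cleaned)
		oldRSs[i].replicas -= n
		cleaned += n
	}
	return oldRSs, cleaned
}

func min32(a, b int32) int32 {
	if a < b {
		return a
	}
	return b
}

func main() {
	// The example from the comment above: 10 replicas total, scaling to 6,
	// so 4 replicas must go (deploymentReplicasToAdd = -4).
	old := []rs{
		{name: "oldRS-1", replicas: 4, available: 4}, // 4 healthy
		{name: "oldRS-2", replicas: 2, available: 0}, // 2 unhealthy
	}
	old, cleaned := cleanupUnhealthy(old, 4)
	// oldRS-2 drops from 2 to 0 first; the remaining 2 removals are then
	// split between oldRS-1 and newRS by the proportional pass.
	fmt.Println(old, "cleaned:", cleaned) // [{oldRS-1 4 4} {oldRS-2 0 0}] cleaned: 2
}
```

After this cleanup, `deploymentReplicasToAdd` shrinks from -4 to -2, and only those two removals are distributed proportionally, which is exactly why oldRS-2 empties before any healthy Pod is touched.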
21 changes: 19 additions & 2 deletions test/e2e/deployment_test.go
@@ -44,7 +44,7 @@ var _ = SIGDescribe("Advanced Deployment", func() {
return k8sClient.Get(context.TODO(), key, object)
}

UpdateDeployment := func(deployment *apps.Deployment, version string) *apps.Deployment {
UpdateDeployment := func(deployment *apps.Deployment, version string, images ...string) *apps.Deployment {
By(fmt.Sprintf("update deployment %v to version: %v", client.ObjectKeyFromObject(deployment), version))
var clone *apps.Deployment
Expect(retry.RetryOnConflict(defaultRetry, func() error {
@@ -53,7 +53,11 @@ var _ = SIGDescribe("Advanced Deployment", func() {
if err != nil {
return err
}
clone.Spec.Template.Spec.Containers[0].Image = deployment.Spec.Template.Spec.Containers[0].Image
if len(images) == 0 {
clone.Spec.Template.Spec.Containers[0].Image = deployment.Spec.Template.Spec.Containers[0].Image
} else {
clone.Spec.Template.Spec.Containers[0].Image = images[0]
}
clone.Spec.Template.Spec.Containers[0].Env[0].Value = version
strategy := unmarshal(clone.Annotations[rolloutsv1alpha1.DeploymentStrategyAnnotation])
strategy.Paused = true
@@ -324,6 +328,19 @@ var _ = SIGDescribe("Advanced Deployment", func() {
UpdatePartitionWithCheck(deployment, intstr.FromInt(3))
UpdatePartitionWithCheck(deployment, intstr.FromInt(5))
})

It("scale down old unhealthy first", func() {
deployment := &apps.Deployment{}
deployment.Namespace = namespace
Expect(ReadYamlToObject("./test_data/deployment/deployment.yaml", deployment)).ToNot(HaveOccurred())
deployment.Annotations["rollouts.kruise.io/deployment-strategy"] = `{"rollingUpdate":{"maxUnavailable":0,"maxSurge":1}}`
CreateObject(deployment)
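// Roll to a deliberately broken image so every new-revision Pod stays
// unhealthy; with maxUnavailable=0/maxSurge=1 the rollout surges one Pod
// that can never become available.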
UpdateDeployment(deployment, "version2", "busybox:not-exists")
UpdatePartitionWithoutCheck(deployment, intstr.FromInt(1))
CheckReplicas(deployment, 6, 5, 1)
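// Rolling forward to a working image turns the version2 replica set into an
// old RS, so its unhealthy Pod is scaled down first and the Deployment
// settles back at 5 Pods. (Reading the CheckReplicas arguments as
// replicas/available/unavailable is an assumption about the test helper.)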
UpdateDeployment(deployment, "version3", "busybox:1.32")
CheckReplicas(deployment, 5, 5, 0)
})
})
})

