Merge pull request #4729 from chaosi-zju/hpav5
fix deployment replicas syncer in case that `status.replicas` hasn't been collected from member cluster to template
karmada-bot committed Mar 27, 2024
2 parents db15f17 + 31d6855 commit ff7322a
Showing 2 changed files with 70 additions and 18 deletions.
@@ -18,7 +18,8 @@ package deploymentreplicassyncer

 import (
 	"context"
-	"fmt"
+	"encoding/json"
+	"time"

 	appsv1 "k8s.io/api/apps/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -38,6 +39,8 @@ import (
 const (
 	// ControllerName is the controller name that will be used when reporting events.
 	ControllerName = "deployment-replicas-syncer"
+
+	waitDeploymentStatusInterval = 1 * time.Second
 )

 // DeploymentReplicasSyncer is to sync deployment replicas from status field to spec field.
@@ -98,6 +101,7 @@ func (r *DeploymentReplicasSyncer) Reconcile(ctx context.Context, req controller
 	binding := &workv1alpha2.ResourceBinding{}
 	bindingName := names.GenerateBindingName(util.DeploymentKind, req.Name)

+	// 1. get latest binding
 	if err := r.Client.Get(ctx, client.ObjectKey{Namespace: req.Namespace, Name: bindingName}, binding); err != nil {
 		if apierrors.IsNotFound(err) {
 			klog.Infof("no need to update deployment replicas for binding not found")
@@ -106,11 +110,12 @@ func (r *DeploymentReplicasSyncer) Reconcile(ctx context.Context, req controller
 		return controllerruntime.Result{}, err
 	}

-	// if it is not divided schedule type, no need to update replicas
+	// 2. if it is not divided schedule type, no need to update replicas
 	if binding.Spec.Placement.ReplicaSchedulingType() != policyv1alpha1.ReplicaSchedulingTypeDivided {
 		return controllerruntime.Result{}, nil
 	}

+	// 3. get latest deployment
 	if err := r.Client.Get(ctx, req.NamespacedName, deployment); err != nil {
 		if apierrors.IsNotFound(err) {
 			klog.Infof("no need to update deployment replicas for deployment not found")
@@ -119,48 +124,83 @@ func (r *DeploymentReplicasSyncer) Reconcile(ctx context.Context, req controller
 		return controllerruntime.Result{}, err
 	}

-	// if replicas in spec already the same as in status, no need to update replicas
+	// 4. if replicas in spec already the same as in status, no need to update replicas
 	if deployment.Spec.Replicas != nil && *deployment.Spec.Replicas == deployment.Status.Replicas {
 		klog.Infof("replicas in spec field (%d) already equal to in status field (%d)", *deployment.Spec.Replicas, deployment.Status.Replicas)
 		return controllerruntime.Result{}, nil
 	}

+	// 5. judge whether the deployment modification in spec has taken effect and its status has been collected; otherwise retry.
+	if !isDeploymentStatusCollected(deployment, binding) {
+		return controllerruntime.Result{RequeueAfter: waitDeploymentStatusInterval}, nil
+	}
+
+	// 6. update spec replicas with status replicas
+	oldReplicas := *deployment.Spec.Replicas
+	deployment.Spec.Replicas = &deployment.Status.Replicas
+	if err := r.Client.Update(ctx, deployment); err != nil {
+		klog.Errorf("failed to update deployment (%s/%s) replicas: %+v", deployment.Namespace, deployment.Name, err)
+		return controllerruntime.Result{}, err
+	}
+
+	klog.Infof("successfully updated deployment (%s/%s) replicas from %d to %d", deployment.Namespace,
+		deployment.Name, oldReplicas, deployment.Status.Replicas)
+
+	return controllerruntime.Result{}, nil
+}
+
+// isDeploymentStatusCollected judges whether the deployment modification in spec has taken effect and its status has been collected.
+func isDeploymentStatusCollected(deployment *appsv1.Deployment, binding *workv1alpha2.ResourceBinding) bool {
 	// make sure the replicas change in deployment.spec can sync to binding.spec, otherwise retry
 	if deployment.Spec.Replicas == nil || *deployment.Spec.Replicas != binding.Spec.Replicas {
 		klog.V(4).Infof("wait until replicas of binding (%d) equal to replicas of deployment (%d)",
 			binding.Spec.Replicas, *deployment.Spec.Replicas)
-		return controllerruntime.Result{}, fmt.Errorf("retry to wait replicas change sync to binding")
+		return false
 	}

 	// make sure the scheduler observed generation equal to generation in binding, otherwise retry
 	if binding.Generation != binding.Status.SchedulerObservedGeneration {
 		klog.V(4).Infof("wait until scheduler observed generation (%d) equal to generation in binding (%d)",
 			binding.Status.SchedulerObservedGeneration, binding.Generation)
-		return controllerruntime.Result{}, fmt.Errorf("retry to wait scheduler observed generation")
+		return false
 	}

 	// make sure the number of aggregated status in binding is as expected.
 	if len(binding.Status.AggregatedStatus) != len(binding.Spec.Clusters) {
 		klog.V(4).Infof("wait until all clusters status collected, got: %d, expected: %d",
 			len(binding.Status.AggregatedStatus), len(binding.Spec.Clusters))
-		return controllerruntime.Result{}, fmt.Errorf("retry to wait status in binding collected")
+		return false
 	}

+	// make sure the status.replicas in binding has been collected from work to binding.
+	bindingStatusSumReplicas := int32(0)
 	for _, status := range binding.Status.AggregatedStatus {
 		if status.Status == nil {
 			klog.V(4).Infof("wait until aggregated status of cluster %s collected", status.ClusterName)
-			return controllerruntime.Result{}, fmt.Errorf("retry to wait status in binding collected")
+			return false
 		}
+		itemStatus := &appsv1.DeploymentStatus{}
+		if err := json.Unmarshal(status.Status.Raw, itemStatus); err != nil {
+			klog.Errorf("unmarshal status.raw of cluster %s in binding failed: %+v", status.ClusterName, err)
+			return false
+		}
+		// if the member cluster deployment is controlled by hpa, its status.replicas must be > 0 (since hpa.minReplicas > 0);
+		// so, if its status.replicas is 0, the status hasn't been collected from the member cluster yet and we need to keep waiting.
+		if itemStatus.Replicas <= 0 {
+			klog.V(4).Infof("wait until aggregated status replicas of cluster %s collected", status.ClusterName)
+			return false
+		}
+		bindingStatusSumReplicas += itemStatus.Replicas
 	}

-	// update replicas
-	oldReplicas := *deployment.Spec.Replicas
-	deployment.Spec.Replicas = &deployment.Status.Replicas
-	if err := r.Client.Update(ctx, deployment); err != nil {
-		klog.Errorf("failed to update deployment (%s/%s) replicas: %+v", deployment.Namespace, deployment.Name, err)
-		return controllerruntime.Result{}, err
+	// make sure the latest collected status.replicas is synced from binding to template.
+	// e.g.: the user sets deployment.spec.replicas = 2, then the sum of replicas in binding.status becomes 2; however, if
+	// deployment.status.replicas is still 1 at this point, we shouldn't update spec.replicas to 1 until status.replicas becomes 2.
+	if deployment.Status.Replicas != bindingStatusSumReplicas {
+		klog.V(4).Infof("wait until deployment status replicas (%d) equal to binding status replicas (%d)",
+			deployment.Status.Replicas, bindingStatusSumReplicas)
+		return false
 	}

-	klog.Infof("successfully udpate deployment (%s/%s) replicas from %d to %d", deployment.Namespace,
-		deployment.Namespace, oldReplicas, deployment.Status.Replicas)
-
-	return controllerruntime.Result{}, nil
+	return true
 }
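The per-cluster decoding above is the core of the fix: an aggregated status that has not yet been reflected from the member cluster carries raw bytes of `{}`, which decode to zero replicas. Below is a minimal standalone sketch of that behavior (not part of this commit; the package layout and sample values are illustrative):

package main

import (
	"encoding/json"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

func main() {
	// raw bytes as they would appear in binding.status.aggregatedStatus[i].status.raw
	collected := []byte(`{"replicas":2,"readyReplicas":2}`)
	notCollected := []byte(`{}`) // nothing reflected from the member cluster yet

	for _, raw := range [][]byte{collected, notCollected} {
		itemStatus := &appsv1.DeploymentStatus{}
		if err := json.Unmarshal(raw, itemStatus); err != nil {
			fmt.Println("unmarshal failed:", err)
			continue
		}
		// mirrors the itemStatus.Replicas <= 0 check in isDeploymentStatusCollected:
		// zero replicas is read as "status not collected yet, keep waiting"
		fmt.Printf("raw=%s -> replicas=%d collected=%v\n", raw, itemStatus.Replicas, itemStatus.Replicas > 0)
	}
}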
14 changes: 13 additions & 1 deletion pkg/resourceinterpreter/default/native/reflectstatus.go
@@ -17,6 +17,7 @@ limitations under the License.
 package native

 import (
+	"bytes"
 	"fmt"

 	appsv1 "k8s.io/api/apps/v1"
@@ -74,7 +75,18 @@ func reflectDeploymentStatus(object *unstructured.Unstructured) (*runtime.RawExt
 		AvailableReplicas:   deploymentStatus.AvailableReplicas,
 		UnavailableReplicas: deploymentStatus.UnavailableReplicas,
 	}
-	return helper.BuildStatusRawExtension(grabStatus)
+
+	grabStatusRaw, err := helper.BuildStatusRawExtension(grabStatus)
+	if err != nil {
+		return nil, err
+	}
+
+	// if status is an empty struct, it actually means the status hasn't been collected from the member cluster.
+	if bytes.Equal(grabStatusRaw.Raw, []byte("{}")) {
+		return nil, nil
+	}
+
+	return grabStatusRaw, nil
 }

 func reflectServiceStatus(object *unstructured.Unstructured) (*runtime.RawExtension, error) {
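The `{}` comparison works because every field of appsv1.DeploymentStatus carries an `omitempty` JSON tag, so a status that has never been populated serializes to exactly `{}`. A standalone sketch (not part of this commit; plain json.Marshal stands in here for helper.BuildStatusRawExtension, which is assumed to JSON-encode the status):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
)

func main() {
	// an unpopulated status: all fields are zero values, all tags are omitempty
	empty, _ := json.Marshal(appsv1.DeploymentStatus{})
	// a status actually reported by a member cluster
	populated, _ := json.Marshal(appsv1.DeploymentStatus{Replicas: 3, ReadyReplicas: 3})

	fmt.Println(string(empty))                        // {}
	fmt.Println(bytes.Equal(empty, []byte("{}")))     // true  -> return nil, nil (treat as not collected)
	fmt.Println(string(populated))                    // {"replicas":3,"readyReplicas":3}
	fmt.Println(bytes.Equal(populated, []byte("{}"))) // false -> report the grabbed status
}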
