Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add maxsurge option for control plane upgrade #188

Merged
merged 4 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions controlplane/api/v1alpha1/rke2controlplane_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package v1alpha1
import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"

Expand Down Expand Up @@ -71,6 +72,11 @@ type RKE2ControlPlaneSpec struct {
// the registration type is "address". Its for scenarios where a load-balancer or VIP is used.
// +optional
RegistrationAddress string `json:"registrationAddress,omitempty"`

// The RolloutStrategy to use to replace control plane machines with new ones.
// +optional
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
richardcase marked this conversation as resolved.
Show resolved Hide resolved
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`
}

// RKE2ServerConfig specifies configuration for the agent nodes.
Expand Down Expand Up @@ -343,6 +349,40 @@ const (
MetricsServer DisabledPluginComponent = "rke2-metrics-server"
)

// RolloutStrategy describes how to replace existing machines
// with new ones.
//
// It pairs the strategy Type with the parameters that belong to that strategy:
// RollingUpdate holds the parameters used when Type is "RollingUpdate".
// Both fields are optional and are omitted from the serialized JSON when empty.
type RolloutStrategy struct {
	// Type of rollout. Currently the only supported strategy is "RollingUpdate".
	// Default is RollingUpdate.
	// +optional
	Type RolloutStrategyType `json:"type,omitempty"`

	// Rolling update config params. Present only if RolloutStrategyType = RollingUpdate.
	// +optional
	RollingUpdate *RollingUpdate `json:"rollingUpdate,omitempty"`
}

// RollingUpdate is used to control the desired behavior of rolling update.
//
// NOTE(review): MaxSurge is declared as an int-or-string value and nothing in
// this type restricts it to the documented values 0 and 1 — presumably that is
// enforced by validation elsewhere; confirm.
type RollingUpdate struct {
	// The maximum number of control planes that can be scheduled above or under the
	// desired number of control planes.
	// Value can be an absolute number 1 or 0.
	// Defaults to 1.
	// Example: when this is set to 1, the control plane can be scaled
	// up immediately when the rolling update starts.
	// +optional
	MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

// RolloutStrategyType defines the rollout strategies for a RKE2ControlPlane.
// It is a string type; the supported values are the constants declared below.
type RolloutStrategyType string

const (
	// RollingUpdateStrategyType replaces the old control plane machines with new
	// ones using a rolling update, i.e. machines are gradually scaled up and
	// scaled down until only new machines remain.
	RollingUpdateStrategyType RolloutStrategyType = "RollingUpdate"
)

// init registers the RKE2ControlPlane and RKE2ControlPlaneList types with
// SchemeBuilder when the package is loaded.
func init() { //nolint:gochecknoinits
	SchemeBuilder.Register(&RKE2ControlPlane{}, &RKE2ControlPlaneList{})
}
Expand Down
46 changes: 46 additions & 0 deletions controlplane/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,34 @@ spec:
description: Replicas is the number of replicas for the Control Plane.
format: int32
type: integer
rolloutStrategy:
default:
rollingUpdate:
maxSurge: 1
type: RollingUpdate
description: The RolloutStrategy to use to replace control plane machines
with new ones.
properties:
rollingUpdate:
description: Rolling update config params. Present only if RolloutStrategyType
= RollingUpdate.
properties:
maxSurge:
anyOf:
- type: integer
- type: string
description: 'The maximum number of control planes that can
be scheduled above or under the desired number of control
planes. Value can be an absolute number 1 or 0. Defaults
to 1. Example: when this is set to 1, the control plane
can be scaled up immediately when the rolling update starts.'
x-kubernetes-int-or-string: true
type: object
type:
description: Type of rollout. Currently the only supported strategy
is "RollingUpdate". Default is RollingUpdate.
type: string
type: object
serverConfig:
description: ServerConfig specifies configuration for the agent nodes.
properties:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ func patchRKE2ControlPlane(ctx context.Context, patchHelper *patch.Helper, rcp *
func (r *RKE2ControlPlaneReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
c, err := ctrl.NewControllerManagedBy(mgr).
For(&controlplanev1.RKE2ControlPlane{}).
Owns(&clusterv1.Machine{}).
Build(r)
if err != nil {
return errors.Wrap(err, "failed setting up with a controller manager")
Expand Down Expand Up @@ -742,14 +743,22 @@ func (r *RKE2ControlPlaneReconciler) upgradeControlPlane(
return ctrl.Result{}, err
}

status := workloadCluster.ClusterStatus()
switch rcp.Spec.RolloutStrategy.Type {
case controlplanev1.RollingUpdateStrategyType:
// RolloutStrategy is currently defaulted and validated to be RollingUpdate.
maxNodes := *rcp.Spec.Replicas + int32(rcp.Spec.RolloutStrategy.RollingUpdate.MaxSurge.IntValue())
if int32(controlPlane.Machines.Len()) < maxNodes {
richardcase marked this conversation as resolved.
Show resolved Hide resolved
// scaleUpControlPlane ensures that we don't continue scaling up while waiting for Machines to have NodeRefs
return r.scaleUpControlPlane(ctx, cluster, rcp, controlPlane)
}

if status.Nodes <= *rcp.Spec.Replicas {
// scaleUp ensures that we don't continue scaling up while waiting for Machines to have NodeRefs
return r.scaleUpControlPlane(ctx, cluster, rcp, controlPlane)
}
return r.scaleDownControlPlane(ctx, cluster, rcp, controlPlane, machinesRequireUpgrade)
default:
err := fmt.Errorf("unknown rollout strategy type %q", rcp.Spec.RolloutStrategy.Type)
logger.Error(err, "RolloutStrategy type is not set to RollingUpdateStrategyType, unable to determine the strategy for rolling out machines")

return r.scaleDownControlPlane(ctx, cluster, rcp, controlPlane, machinesRequireUpgrade)
return ctrl.Result{}, nil
}
}

// ClusterToRKE2ControlPlane is a handler.ToRequestsFunc to be used to enqueue requests for reconciliation
Expand Down
2 changes: 1 addition & 1 deletion test/e2e/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ import (
const (
	KubernetesVersionManagement     = "KUBERNETES_VERSION_MANAGEMENT"
	KubernetesVersion               = "KUBERNETES_VERSION"
	KubernetesVersionUpgradeFrom    = "KUBERNETES_VERSION_UPGRADE_FROM"
	KubernetesVersionUpgradeTo      = "KUBERNETES_VERSION_UPGRADE_TO" // e2e config variable holding the version the upgrade test moves the cluster to
	CPMachineTemplateUpgradeTo      = "CONTROL_PLANE_MACHINE_TEMPLATE_UPGRADE_TO"
	WorkersMachineTemplateUpgradeTo = "WORKERS_MACHINE_TEMPLATE_UPGRADE_TO"
	IPFamily                        = "IP_FAMILY"
	KindImageVersion                = "KIND_IMAGE_VERSION" // e2e config variable holding the kindest/node image tag used by the Docker cluster template
)

func Byf(format string, a ...interface{}) {
Expand Down
8 changes: 4 additions & 4 deletions test/e2e/config/e2e_conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,13 @@ providers:
new: "--leader-elect=false"

variables:
KUBERNETES_VERSION_MANAGEMENT: "v1.24.4"
KUBERNETES_VERSION: "v1.24.4"
KUBERNETES_VERSION_MANAGEMENT: "v1.28.0"
KUBERNETES_VERSION: "v1.28.1"
KIND_IMAGE_VERSION: "v1.28.0"
NODE_DRAIN_TIMEOUT: "60s"
CONFORMANCE_WORKER_MACHINE_COUNT: "2"
CONFORMANCE_CONTROL_PLANE_MACHINE_COUNT: "1"
KUBERNETES_VERSION_UPGRADE_TO: "v1.24.2"
KUBERNETES_VERSION_UPGRADE_FROM: "v1.23.8"
KUBERNETES_VERSION_UPGRADE_TO: "v1.28.2"
KUBERNETES_UPGRADE_OCI_IMAGE_ID: "${KUBERNETES_UPGRADE_OCI_IMAGE_ID}"
IP_FAMILY: "IPv4"
EXP_CLUSTER_RESOURCE_SET: "true"
Expand Down
8 changes: 5 additions & 3 deletions test/e2e/data/infrastructure/cluster-template-docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ metadata:
name: "${CLUSTER_NAME}-control-plane"
spec:
template:
spec: {}
spec:
customImage: kindest/node:${KIND_IMAGE_VERSION}
richardcase marked this conversation as resolved.
Show resolved Hide resolved
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDeployment
Expand Down Expand Up @@ -80,7 +81,8 @@ metadata:
name: ${CLUSTER_NAME}-md-0
spec:
template:
spec: {}
spec:
customImage: kindest/node:${KIND_IMAGE_VERSION}
---
apiVersion: bootstrap.cluster.x-k8s.io/v1alpha1
kind: RKE2ConfigTemplate
Expand All @@ -92,4 +94,4 @@ spec:
agentConfig:
version: ${KUBERNETES_VERSION}+rke2r1
nodeAnnotations:
test: "true"
test: "true"
3 changes: 3 additions & 0 deletions test/e2e/e2e_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"

bootstrapv1 "github.com/rancher-sandbox/cluster-api-provider-rke2/bootstrap/api/v1alpha1"
controlplanev1 "github.com/rancher-sandbox/cluster-api-provider-rke2/controlplane/api/v1alpha1"
Expand Down Expand Up @@ -104,6 +105,8 @@ func init() {
// TestE2E is the `go test` entry point for the e2e suite. It registers Fail as
// the assertion failure handler, sets the ctrl logger to klog's background
// logger, and then runs the specs under the suite name "caprke2-e2e".
func TestE2E(t *testing.T) {
	RegisterFailHandler(Fail)

	// Install the logger before any spec runs.
	ctrl.SetLogger(klog.Background())

	RunSpecs(t, "caprke2-e2e")
}

Expand Down
32 changes: 32 additions & 0 deletions test/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,38 @@ var _ = Describe("Workload cluster creation", func() {
ControlPlane: result.ControlPlane,
}, e2eConfig.GetIntervals(specName, "wait-control-plane")...)

By("Upgrading control plane and worker machines")
ApplyClusterTemplateAndWait(ctx, ApplyClusterTemplateAndWaitInput{
richardcase marked this conversation as resolved.
Show resolved Hide resolved
ClusterProxy: bootstrapClusterProxy,
ConfigCluster: clusterctl.ConfigClusterInput{
LogFolder: clusterctlLogFolder,
ClusterctlConfigPath: clusterctlConfigPath,
KubeconfigPath: bootstrapClusterProxy.GetKubeconfigPath(),
InfrastructureProvider: "docker",
Flavor: "docker",
Namespace: namespace.Name,
ClusterName: clusterName,
KubernetesVersion: e2eConfig.GetVariable(KubernetesVersionUpgradeTo),
ControlPlaneMachineCount: pointer.Int64Ptr(3),
WorkerMachineCount: pointer.Int64Ptr(3),
},
WaitForClusterIntervals: e2eConfig.GetIntervals(specName, "wait-cluster"),
WaitForControlPlaneIntervals: e2eConfig.GetIntervals(specName, "wait-control-plane"),
WaitForMachineDeployments: e2eConfig.GetIntervals(specName, "wait-worker-nodes"),
}, result)

WaitForClusterToUpgrade(ctx, WaitForClusterToUpgradeInput{
Lister: bootstrapClusterProxy.GetClient(),
ControlPlane: result.ControlPlane,
MachineDeployments: result.MachineDeployments,
VersionAfterUpgrade: e2eConfig.GetVariable(KubernetesVersionUpgradeTo),
}, e2eConfig.GetIntervals(specName, "wait-control-plane")...)

WaitForControlPlaneToBeReady(ctx, WaitForControlPlaneToBeReadyInput{
Getter: bootstrapClusterProxy.GetClient(),
ControlPlane: result.ControlPlane,
}, e2eConfig.GetIntervals(specName, "wait-control-plane")...)

// TODO: this can be uncommented when control plane scaling down is working

// By("Scaling control plane nodes to 1")
Expand Down
38 changes: 38 additions & 0 deletions test/e2e/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package e2e

import (
"context"
"fmt"
"time"

. "github.com/onsi/ginkgo/v2"
Expand Down Expand Up @@ -251,6 +252,43 @@ func WaitForMachineConditions(ctx context.Context, input WaitForMachineCondition
}, intervals...).Should(BeTrue(), framework.PrettyPrint(input.Machine)+"\n")
}

// WaitForClusterToUpgradeInput is the input for WaitForClusterToUpgrade.
type WaitForClusterToUpgradeInput struct {
	// Lister is used to list the Machines that are inspected on every poll.
	Lister framework.Lister
	// ControlPlane supplies the control plane replica count (Spec.Replicas must be
	// set) and is printed in the failure message.
	ControlPlane *controlplanev1.RKE2ControlPlane
	// MachineDeployments supply the worker replica counts (Spec.Replicas must be
	// set on each) that are added to the expected number of Machines.
	MachineDeployments []*clusterv1.MachineDeployment
	// VersionAfterUpgrade is the value each Machine's Spec.Version is compared to.
	VersionAfterUpgrade string
}

// WaitForClusterToUpgrade will wait for a cluster to be upgraded.
func WaitForClusterToUpgrade(ctx context.Context, input WaitForClusterToUpgradeInput, intervals ...interface{}) {
By("Waiting for machines to update")

totallMachineCount := *input.ControlPlane.Spec.Replicas
for _, md := range input.MachineDeployments {
totallMachineCount += *md.Spec.Replicas
}

Eventually(func() (bool, error) {
machineList := &clusterv1.MachineList{}
if err := input.Lister.List(ctx, machineList); err != nil {
return false, fmt.Errorf("failed to list machines: %w", err)
}

if len(machineList.Items) != int(totallMachineCount) { // not all replicas are created
return false, nil
}

for _, machine := range machineList.Items {
if machine.Spec.Version != nil && *machine.Spec.Version != input.VersionAfterUpgrade {
alexander-demicev marked this conversation as resolved.
Show resolved Hide resolved
return false, nil
}
}

return true, nil
}, intervals...).Should(BeTrue(), framework.PrettyPrint(input.ControlPlane)+"\n")
}

func setDefaults(input *ApplyClusterTemplateAndWaitInput) {
if input.WaitForControlPlaneInitialized == nil {
input.WaitForControlPlaneInitialized = func(ctx context.Context, input ApplyClusterTemplateAndWaitInput, result *ApplyClusterTemplateAndWaitResult) {
Expand Down