Skip to content

Commit

Permalink
Add support for KCP remediation during cluster provisioning
Browse files Browse the repository at this point in the history
Co-authored-by: sbueringer <buringerst@vmware.com>
  • Loading branch information
fabriziopandini committed Feb 9, 2023
1 parent bc98c72 commit e98e2de
Show file tree
Hide file tree
Showing 30 changed files with 2,483 additions and 306 deletions.
7 changes: 7 additions & 0 deletions controlplane/kubeadm/api/v1alpha3/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
}

if restored.Spec.RemediationStrategy != nil {
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
}
if restored.Status.LastRemediation != nil {
dst.Status.LastRemediation = restored.Status.LastRemediation
}

return nil
}

Expand Down
2 changes: 2 additions & 0 deletions controlplane/kubeadm/api/v1alpha3/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions controlplane/kubeadm/api/v1alpha4/conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ func (src *KubeadmControlPlane) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
}

if restored.Spec.RemediationStrategy != nil {
dst.Spec.RemediationStrategy = restored.Spec.RemediationStrategy
}
if restored.Status.LastRemediation != nil {
dst.Status.LastRemediation = restored.Status.LastRemediation
}

return nil
}

Expand Down Expand Up @@ -173,6 +180,10 @@ func (src *KubeadmControlPlaneTemplate) ConvertTo(dstRaw conversion.Hub) error {
dst.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy = restored.Spec.Template.Spec.KubeadmConfigSpec.InitConfiguration.NodeRegistration.ImagePullPolicy
}

if restored.Spec.Template.Spec.RemediationStrategy != nil {
dst.Spec.Template.Spec.RemediationStrategy = restored.Spec.Template.Spec.RemediationStrategy
}

return nil
}

Expand Down Expand Up @@ -262,5 +273,11 @@ func Convert_v1beta1_KubeadmControlPlaneMachineTemplate_To_v1alpha4_KubeadmContr

func Convert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in *controlplanev1.KubeadmControlPlaneSpec, out *KubeadmControlPlaneSpec, scope apiconversion.Scope) error {
// .RolloutBefore was added in v1beta1.
// .RemediationStrategy was added in v1beta1.
return autoConvert_v1beta1_KubeadmControlPlaneSpec_To_v1alpha4_KubeadmControlPlaneSpec(in, out, scope)
}

func Convert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in *controlplanev1.KubeadmControlPlaneStatus, out *KubeadmControlPlaneStatus, scope apiconversion.Scope) error {
// .LastRemediation was added in v1beta1.
return autoConvert_v1beta1_KubeadmControlPlaneStatus_To_v1alpha4_KubeadmControlPlaneStatus(in, out, scope)
}
17 changes: 7 additions & 10 deletions controlplane/kubeadm/api/v1alpha4/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

86 changes: 86 additions & 0 deletions controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package v1beta1

import (
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
Expand Down Expand Up @@ -49,6 +51,23 @@ const (
// KubeadmClusterConfigurationAnnotation is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration.
// This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP.
KubeadmClusterConfigurationAnnotation = "controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration"

// RemediationInProgressAnnotation is used to keep track that a KCP remediation is in progress, and more
// specifically it tracks that the system is in between having deleted an unhealthy machine and recreating its replacement.
// NOTE: if something external to CAPI removes this annotation the system cannot detect the above situation; this can lead to
// failures in updating remediation retry or remediation count (both counters restart from zero).
RemediationInProgressAnnotation = "controlplane.cluster.x-k8s.io/remediation-in-progress"

// RemediationForAnnotation is used to link a new machine to the unhealthy machine it is replacing;
// please note that in case of retry, when also the remediating machine fails, the system keeps track of
// the first machine of the sequence only.
// NOTE: if something external to CAPI removes this annotation the system this can lead to
// failures in updating remediation retry (the counter restarts from zero).
RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"

// DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
// machine unrelated from the previous remediation.
DefaultMinHealthyPeriod = 1 * time.Hour
)

// KubeadmControlPlaneSpec defines the desired state of KubeadmControlPlane.
Expand Down Expand Up @@ -91,6 +110,10 @@ type KubeadmControlPlaneSpec struct {
// +optional
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`

// The RemediationStrategy that controls how control plane machine remediation happens.
// +optional
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
}

// KubeadmControlPlaneMachineTemplate defines the template for Machines
Expand Down Expand Up @@ -158,6 +181,50 @@ type RollingUpdate struct {
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

// RemediationStrategy allows to define how control plane machine remediation happens.
type RemediationStrategy struct {
// MaxRetry is the Max number of retries while attempting to remediate an unhealthy machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
// For example, given a control plane with three machines M1, M2, M3:
//
// M1 become unhealthy; remediation happens, and M1-1 is created as a replacement.
// If M1-1 (replacement of M1) has problems while bootstrapping it will become unhealthy, and then be
// remediated; such operation is considered a retry, remediation-retry #1.
// If M1-2 (replacement of M1-2) becomes unhealthy, remediation-retry #2 will happen, etc.
//
// A retry could happen only after RetryPeriod from the previous retry.
// If a machine is marked as unhealthy after MinHealthyPeriod from the previous remediation expired,
// this is not considered a retry anymore because the new issue is assumed unrelated from the previous one.
//
// If not set, the remedation will be retried infinitely.
// +optional
MaxRetry *int32 `json:"maxRetry,omitempty"`

// RetryPeriod is the duration that KCP should wait before remediating a machine being created as a replacement
// for an unhealthy machine (a retry).
//
// If not set, a retry will happen immediately.
// +optional
RetryPeriod metav1.Duration `json:"retryPeriod,omitempty"`

// MinHealthyPeriod defines the duration after which KCP will consider any failure to a machine unrelated
// from the previous one. In this case the remediation is not considered a retry anymore, and thus the retry
// counter restarts from 0. For example, assuming MinHealthyPeriod is set to 1h (default)
//
// M1 become unhealthy; remediation happens, and M1-1 is created as a replacement.
// If M1-1 (replacement of M1) has problems within the 1hr after the creation, also
// this machine will be remediated and this operation is considered a retry - a problem related
// to the original issue happened to M1 -.
//
// If instead the problem on M1-1 is happening after MinHealthyPeriod expired, e.g. four days after
// m1-1 has been created as a remediation of M1, the problem on M1-1 is considered unrelated to
// the original issue happened to M1.
//
// If not set, this value is defaulted to 1h.
// +optional
MinHealthyPeriod *metav1.Duration `json:"minHealthyPeriod,omitempty"`
}

// KubeadmControlPlaneStatus defines the observed state of KubeadmControlPlane.
type KubeadmControlPlaneStatus struct {
// Selector is the label selector in string format to avoid introspection
Expand Down Expand Up @@ -223,6 +290,25 @@ type KubeadmControlPlaneStatus struct {
// Conditions defines current service state of the KubeadmControlPlane.
// +optional
Conditions clusterv1.Conditions `json:"conditions,omitempty"`

// LastRemediation stores info about last remediation performed.
// +optional
LastRemediation *LastRemediationStatus `json:"lastRemediation,omitempty"`
}

// LastRemediationStatus stores info about last remediation performed.
// NOTE: if for any reason information about last remediation are lost, RetryCount is going to restart from 0 and thus
// more remediations than expected might happen.
type LastRemediationStatus struct {
// Machine is the machine name of the latest machine being remediated.
Machine string `json:"machine"`

// Timestamp is when last remediation happened. It is represented in RFC3339 form and is in UTC.
Timestamp metav1.Time `json:"timestamp"`

// RetryCount used to keep track of remediation retry for the last remediated machine.
// A retry happens when a machine that was created as a replacement for an unhealthy machine also fails.
RetryCount int32 `json:"retryCount"`
}

// +kubebuilder:object:root=true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ func (in *KubeadmControlPlane) ValidateUpdate(old runtime.Object) error {
{spec, "machineTemplate", "nodeDeletionTimeout"},
{spec, "replicas"},
{spec, "version"},
{spec, "remediationStrategy"},
{spec, "remediationStrategy", "*"},
{spec, "rolloutAfter"},
{spec, "rolloutBefore"},
{spec, "rolloutBefore", "*"},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,11 @@ func TestKubeadmControlPlaneValidateUpdate(t *testing.T) {
validUpdate.Spec.RolloutBefore = &RolloutBefore{
CertificatesExpiryDays: pointer.Int32(14),
}
validUpdate.Spec.RemediationStrategy = &RemediationStrategy{
MaxRetry: pointer.Int32(50),
MinHealthyPeriod: &metav1.Duration{Duration: 10 * time.Hour},
RetryPeriod: metav1.Duration{Duration: 10 * time.Minute},
}
validUpdate.Spec.KubeadmConfigSpec.Format = bootstrapv1.CloudConfig

scaleToZero := before.DeepCopy()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ type KubeadmControlPlaneTemplateResourceSpec struct {
// +optional
// +kubebuilder:default={type: "RollingUpdate", rollingUpdate: {maxSurge: 1}}
RolloutStrategy *RolloutStrategy `json:"rolloutStrategy,omitempty"`

// The RemediationStrategy that controls how control plane machine remediation happens.
// +optional
RemediationStrategy *RemediationStrategy `json:"remediationStrategy,omitempty"`
}

// KubeadmControlPlaneTemplateMachineTemplate defines the template for Machines
Expand Down
57 changes: 57 additions & 0 deletions controlplane/kubeadm/api/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit e98e2de

Please sign in to comment.