Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Runtime sdk/flaky e2e fix #6817

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 63 additions & 92 deletions test/e2e/cluster_upgrade_runtimesdk.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
runtimev1 "sigs.k8s.io/cluster-api/exp/runtime/api/v1alpha1"
"sigs.k8s.io/cluster-api/test/e2e/internal/log"
"sigs.k8s.io/cluster-api/test/framework"
Expand All @@ -44,8 +43,6 @@ import (
"sigs.k8s.io/cluster-api/util/conditions"
)

var hookFailedMessage = "hook failed"

// clusterUpgradeWithRuntimeSDKSpecInput is the input for clusterUpgradeWithRuntimeSDKSpec.
type clusterUpgradeWithRuntimeSDKSpecInput struct {
E2EConfig *clusterctl.E2EConfig
Expand Down Expand Up @@ -83,14 +80,14 @@ func clusterUpgradeWithRuntimeSDKSpec(ctx context.Context, inputGetter func() cl
var (
input clusterUpgradeWithRuntimeSDKSpecInput
namespace *corev1.Namespace
ext *runtimev1.ExtensionConfig
cancelWatches context.CancelFunc

controlPlaneMachineCount int64
workerMachineCount int64

clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
testExtensionPath string
clusterName string
)

BeforeEach(func() {
Expand Down Expand Up @@ -123,11 +120,12 @@ func clusterUpgradeWithRuntimeSDKSpec(ctx context.Context, inputGetter func() cl

// Set up a Namespace where to host objects for this spec and create a watcher for the Namespace events.
namespace, cancelWatches = setupSpecNamespace(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder)
clusterName = fmt.Sprintf("%s-%s", specName, util.RandomString(6))

clusterResources = new(clusterctl.ApplyClusterTemplateAndWaitResult)
})

It("Should create, upgrade and delete a workload cluster", func() {
clusterName := fmt.Sprintf("%s-%s", specName, util.RandomString(6))
By("Deploy Test Extension")
testExtensionDeploymentTemplate, err := os.ReadFile(testExtensionPath) //nolint:gosec
Expect(err).ToNot(HaveOccurred(), "Failed to read the extension deployment manifest file")
Expand All @@ -141,12 +139,14 @@ func clusterUpgradeWithRuntimeSDKSpec(ctx context.Context, inputGetter func() cl
Expect(input.BootstrapClusterProxy.Apply(ctx, []byte(testExtensionDeployment), "--namespace", namespace.Name)).To(Succeed())

By("Deploy Test Extension ExtensionConfig and ConfigMap")
ext = extensionConfig(specName, namespace)
err = input.BootstrapClusterProxy.GetClient().Create(ctx, ext)
Expect(err).ToNot(HaveOccurred(), "Failed to create the extension config")
responses := responsesConfigMap(clusterName, namespace)
err = input.BootstrapClusterProxy.GetClient().Create(ctx, responses)
Expect(err).ToNot(HaveOccurred(), "Failed to create the responses configmap")

Expect(input.BootstrapClusterProxy.GetClient().Create(ctx,
extensionConfig(specName, namespace))).
To(Succeed(), "Failed to create the extension config")

Expect(input.BootstrapClusterProxy.GetClient().Create(ctx,
responsesConfigMap(clusterName, namespace))).
To(Succeed(), "Failed to create the responses configMap")

By("Creating a workload cluster")

Expand Down Expand Up @@ -182,8 +182,6 @@ func clusterUpgradeWithRuntimeSDKSpec(ctx context.Context, inputGetter func() cl
ClusterProxy: input.BootstrapClusterProxy,
Cluster: clusterResources.Cluster,
ControlPlane: clusterResources.ControlPlane,
EtcdImageTag: input.E2EConfig.GetVariable(EtcdVersionUpgradeTo),
DNSImageTag: input.E2EConfig.GetVariable(CoreDNSVersionUpgradeTo),
MachineDeployments: clusterResources.MachineDeployments,
KubernetesUpgradeVersion: input.E2EConfig.GetVariable(KubernetesVersionUpgradeTo),
WaitForMachinesToBeUpgraded: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
Expand All @@ -195,6 +193,7 @@ func clusterUpgradeWithRuntimeSDKSpec(ctx context.Context, inputGetter func() cl
input.BootstrapClusterProxy.GetClient(),
namespace.Name,
clusterName,
input.E2EConfig.GetVariable(KubernetesVersionUpgradeTo),
input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"))
},
PreWaitForMachineDeploymentToBeUpgraded: func() {
Expand Down Expand Up @@ -236,26 +235,26 @@ func clusterUpgradeWithRuntimeSDKSpec(ctx context.Context, inputGetter func() cl

By("Checking all lifecycle hooks have been called")
// Assert that each hook has been called and returned "Success" during the test.
err = checkLifecycleHookResponses(ctx, input.BootstrapClusterProxy.GetClient(), namespace.Name, clusterName, map[string]string{
"BeforeClusterCreate": "Success",
"BeforeClusterUpgrade": "Success",
"BeforeClusterDelete": "Success",
Expect(checkLifecycleHookResponses(ctx, input.BootstrapClusterProxy.GetClient(), namespace.Name, clusterName, map[string]string{
"BeforeClusterCreate": "Status: Success, RetryAfterSeconds: 0",
"BeforeClusterUpgrade": "Status: Success, RetryAfterSeconds: 0",
"BeforeClusterDelete": "Status: Success, RetryAfterSeconds: 0",
"AfterControlPlaneUpgrade": "Status: Success, RetryAfterSeconds: 0",
"AfterControlPlaneInitialized": "Success",
"AfterControlPlaneUpgrade": "Success",
"AfterClusterUpgrade": "Success",
})
Expect(err).ToNot(HaveOccurred(), "Lifecycle hook calls were not as expected")
})).To(Succeed(), "Lifecycle hook calls were not as expected")

By("PASSED!")
})

AfterEach(func() {
// Delete the extensionConfig first to ensure the BeforeDeleteCluster hook doesn't block deletion.
Eventually(func() error {
return input.BootstrapClusterProxy.GetClient().Delete(ctx, extensionConfig(specName, namespace))
}, 10*time.Second, 1*time.Second).Should(Succeed(), "delete extensionConfig failed")

// Dumps all the resources in the spec Namespace, then cleanups the cluster object and the spec Namespace itself.
dumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ArtifactFolder, namespace, cancelWatches, clusterResources.Cluster, input.E2EConfig.GetIntervals, input.SkipCleanup)

Eventually(func() error {
return input.BootstrapClusterProxy.GetClient().Delete(ctx, ext)
}, 10*time.Second, 1*time.Second).Should(Succeed())
})
}

Expand Down Expand Up @@ -303,11 +302,11 @@ func responsesConfigMap(name string, namespace *corev1.Namespace) *corev1.Config
},
// Set the initial preloadedResponses for each of the tested hooks.
Data: map[string]string{
// Blocking hooks are set to Status:Failure initially. These will be changed during the test.
"BeforeClusterCreate-preloadedResponse": fmt.Sprintf(`{"Status": "Failure", "Message": %q}`, hookFailedMessage),
"BeforeClusterUpgrade-preloadedResponse": fmt.Sprintf(`{"Status": "Failure", "Message": %q}`, hookFailedMessage),
"AfterControlPlaneUpgrade-preloadedResponse": fmt.Sprintf(`{"Status": "Failure", "Message": %q}`, hookFailedMessage),
"BeforeClusterDelete-preloadedResponse": fmt.Sprintf(`{"Status": "Failure", "Message": %q}`, hookFailedMessage),
// Blocking hooks are set to return RetryAfterSeconds initially. These will be changed during the test.
"BeforeClusterCreate-preloadedResponse": `{"Status": "Success", "RetryAfterSeconds": 5}`,
"BeforeClusterUpgrade-preloadedResponse": `{"Status": "Success", "RetryAfterSeconds": 5}`,
"AfterControlPlaneUpgrade-preloadedResponse": `{"Status": "Success", "RetryAfterSeconds": 5}`,
"BeforeClusterDelete-preloadedResponse": `{"Status": "Success", "RetryAfterSeconds": 5}`,

// Non-blocking hooks are set to Status:Success.
"AfterControlPlaneInitialized-preloadedResponse": `{"Status": "Success"}`,
Expand Down Expand Up @@ -359,42 +358,29 @@ func beforeClusterCreateTestHandler(ctx context.Context, c client.Client, namesp
runtimeHookTestHandler(ctx, c, namespace, clusterName, hookName, true, func() bool {
blocked := true
// This hook should block the Cluster from entering the "Provisioned" state.
cluster := &clusterv1.Cluster{}
Eventually(func() error {
return c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, cluster)
}).Should(Succeed())
cluster := framework.GetClusterByName(ctx,
framework.GetClusterByNameInput{Name: clusterName, Namespace: namespace, Getter: c})

// Check if the TopologyReconciled condition message contains both the hook name and hookFailedMessage.
if !clusterConditionShowsHookFailed(cluster, hookName) {
blocked = false
}
if cluster.Status.Phase == string(clusterv1.ClusterPhaseProvisioned) {
blocked = false
}
return blocked
}, intervals)
}

// beforeClusterUpgradeTestHandler calls runtimeHookTestHandler with a blocking function which returns false if the
// Cluster has controlplanev1.RollingUpdateInProgressReason in its ReadyCondition.
func beforeClusterUpgradeTestHandler(ctx context.Context, c client.Client, namespace, clusterName string, intervals []interface{}) {
// beforeClusterUpgradeTestHandler calls runtimeHookTestHandler with a blocking function which returns false if
// any of the machines in the control plane has been updated to the target Kubernetes version.
func beforeClusterUpgradeTestHandler(ctx context.Context, c client.Client, namespace, clusterName, toVersion string, intervals []interface{}) {
hookName := "BeforeClusterUpgrade"
runtimeHookTestHandler(ctx, c, namespace, clusterName, hookName, true, func() bool {
var blocked = true

cluster := &clusterv1.Cluster{}
Eventually(func() error {
return c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, cluster)
}).Should(Succeed())

// Check if the TopologyReconciled condition message contains both the hook name and hookFailedMessage.
if !clusterConditionShowsHookFailed(cluster, hookName) {
blocked = false
}
// Check if the Cluster is showing the RollingUpdateInProgress condition reason. If it has the update process is unblocked.
if conditions.IsFalse(cluster, clusterv1.ReadyCondition) &&
conditions.GetReason(cluster, clusterv1.ReadyCondition) == controlplanev1.RollingUpdateInProgressReason {
blocked = false
controlPlaneMachines := framework.GetControlPlaneMachinesByCluster(ctx,
framework.GetControlPlaneMachinesByClusterInput{Lister: c, ClusterName: clusterName, Namespace: namespace})
for _, machine := range controlPlaneMachines {
if *machine.Spec.Version == toVersion {
blocked = false
}
}
return blocked
}, intervals)
Expand All @@ -406,26 +392,11 @@ func afterControlPlaneUpgradeTestHandler(ctx context.Context, c client.Client, n
hookName := "AfterControlPlaneUpgrade"
runtimeHookTestHandler(ctx, c, namespace, clusterName, hookName, true, func() bool {
var blocked = true
cluster := &clusterv1.Cluster{}
Eventually(func() error {
return c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, cluster)
}).Should(Succeed())

// Check if the TopologyReconciled condition message contains both the hook name and hookFailedMessage.
if !clusterConditionShowsHookFailed(cluster, hookName) {
blocked = false
}

mds := &clusterv1.MachineDeploymentList{}
Eventually(func() error {
return c.List(ctx, mds, client.MatchingLabels{
clusterv1.ClusterLabelName: clusterName,
clusterv1.ClusterTopologyOwnedLabel: "",
})
}).Should(Succeed())

mds := framework.GetMachineDeploymentsByCluster(ctx,
framework.GetMachineDeploymentsByClusterInput{ClusterName: clusterName, Namespace: namespace, Lister: c})
// If any of the MachineDeployments have the target Kubernetes Version, the hook is unblocked.
for _, md := range mds.Items {
for _, md := range mds {
if *md.Spec.Template.Spec.Version == version {
blocked = false
}
Expand Down Expand Up @@ -464,23 +435,23 @@ func runtimeHookTestHandler(ctx context.Context, c client.Client, namespace, clu
if err := checkLifecycleHooksCalledAtLeastOnce(ctx, c, namespace, clusterName, []string{hookName}); err != nil {
return err
}
cluster := &clusterv1.Cluster{}
if err := c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, cluster); err != nil {
return err
}

// Check for the existence of the condition if withTopologyReconciledCondition is true.
if withTopologyReconciledCondition &&
(conditions.GetReason(cluster, clusterv1.TopologyReconciledCondition) != clusterv1.TopologyReconcileFailedReason) {
return errors.New("Condition not found on Cluster object")
if withTopologyReconciledCondition {
cluster := framework.GetClusterByName(ctx, framework.GetClusterByNameInput{
Name: clusterName, Namespace: namespace, Getter: c})

if !clusterConditionShowsHookBlocking(cluster, hookName) {
return errors.Errorf("Blocking condition for %s not found on Cluster object", hookName)
}
}
return nil
}, 60*time.Second).Should(Succeed(), "%s has not been called", hookName)
}, 30*time.Second).Should(Succeed(), "%s has not been called", hookName)

// blockingCondition should consistently be true as the Runtime hook is returning "Failure".
Consistently(func() bool {
return blockingCondition()
}, 30*time.Second).Should(BeTrue(),
}, 60*time.Second).Should(BeTrue(),
fabriziopandini marked this conversation as resolved.
Show resolved Hide resolved
fmt.Sprintf("Cluster Topology reconciliation continued unexpectedly: hook %s not blocking", hookName))

// Patch the ConfigMap to set the hook response to "Success".
Expand All @@ -500,32 +471,32 @@ func runtimeHookTestHandler(ctx context.Context, c client.Client, namespace, clu
Eventually(func() bool {
return blockingCondition()
}, intervals...).Should(BeFalse(),
fmt.Sprintf("ClusterTopology reconcile did not unblock after updating hook response: hook %s still blocking", hookName))
fmt.Sprintf("ClusterTopology reconcile did proceed as expected when calling %s", hookName))
sbueringer marked this conversation as resolved.
Show resolved Hide resolved
}

// clusterConditionShowsHookFailed checks if the TopologyReconciled condition message contains both the hook name and hookFailedMessage.
func clusterConditionShowsHookFailed(cluster *clusterv1.Cluster, hookName string) bool {
return conditions.GetReason(cluster, clusterv1.TopologyReconciledCondition) == clusterv1.TopologyReconcileFailedReason &&
strings.Contains(conditions.GetMessage(cluster, clusterv1.TopologyReconciledCondition), hookFailedMessage) &&
// clusterConditionShowsHookBlocking returns true if the Cluster's TopologyReconciled
// condition reports that the given lifecycle hook is currently blocking reconciliation,
// i.e. the condition reason is TopologyReconciledHookBlockingReason and the condition
// message mentions the hook by name.
func clusterConditionShowsHookBlocking(cluster *clusterv1.Cluster, hookName string) bool {
return conditions.GetReason(cluster, clusterv1.TopologyReconciledCondition) == clusterv1.TopologyReconciledHookBlockingReason &&
strings.Contains(conditions.GetMessage(cluster, clusterv1.TopologyReconciledCondition), hookName)
}

func dumpAndDeleteCluster(ctx context.Context, proxy framework.ClusterProxy, namespace, clusterName, artifactFolder string) {
By("Deleting the workload cluster")
cluster := &clusterv1.Cluster{}
Eventually(func() error {
return proxy.GetClient().Get(ctx, client.ObjectKey{Namespace: namespace, Name: clusterName}, cluster)
}).Should(Succeed())

cluster := framework.GetClusterByName(ctx, framework.GetClusterByNameInput{
Name: clusterName, Namespace: namespace, Getter: proxy.GetClient()})

// Dump all the logs from the workload cluster before deleting them.
proxy.CollectWorkloadClusterLogs(ctx, cluster.Namespace, cluster.Name, filepath.Join(artifactFolder, "clusters-beforeClusterDelete", cluster.Name))
proxy.CollectWorkloadClusterLogs(ctx,
cluster.Namespace,
cluster.Name,
filepath.Join(artifactFolder, "clusters-beforeClusterDelete", cluster.Name))

// Dump all Cluster API related resources to artifacts before deleting them.
framework.DumpAllResources(ctx, framework.DumpAllResourcesInput{
Lister: proxy.GetClient(),
Namespace: namespace,
LogPath: filepath.Join(artifactFolder, "clusters-beforeClusterDelete", proxy.GetName(), "resources"),
})
LogPath: filepath.Join(artifactFolder, "clusters-beforeClusterDelete", proxy.GetName(), "resources")})

By("Deleting the workload cluster")
framework.DeleteCluster(ctx, framework.DeleteClusterInput{
killianmuldoon marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
17 changes: 13 additions & 4 deletions test/extension/handlers/lifecycle/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ func (h *Handler) readResponseFromConfigMap(ctx context.Context, name, namespace
if err := yaml.Unmarshal([]byte(configMap.Data[hookName+"-preloadedResponse"]), response); err != nil {
return errors.Wrapf(err, "failed to read %q response information from ConfigMap", hook)
}
if r, ok := response.(runtimehooksv1.RetryResponseObject); ok {
log := ctrl.LoggerFrom(ctx)
log.Info(fmt.Sprintf("%s response is %s. retry: %v", hookName, r.GetStatus(), r.GetRetryAfterSeconds()))
}
return nil
}

Expand All @@ -163,10 +167,15 @@ func (h *Handler) recordCallInConfigMap(ctx context.Context, name, namespace str
if err := h.Client.Get(ctx, client.ObjectKey{Namespace: namespace, Name: configMapName}, configMap); err != nil {
return errors.Wrapf(err, "failed to read the ConfigMap %s/%s", namespace, configMapName)
}

// Patch the actualResponseStatus with the returned value
patch := client.RawPatch(types.MergePatchType,
[]byte(fmt.Sprintf(`{"data":{"%s-actualResponseStatus":"%s"}}`, hookName, response.GetStatus()))) //nolint:gocritic
var patch client.Patch
killianmuldoon marked this conversation as resolved.
Show resolved Hide resolved
if r, ok := response.(runtimehooksv1.RetryResponseObject); ok {
patch = client.RawPatch(types.MergePatchType,
[]byte(fmt.Sprintf(`{"data":{"%s-actualResponseStatus": "Status: %s, RetryAfterSeconds: %v"}}`, hookName, r.GetStatus(), r.GetRetryAfterSeconds())))
} else {
// Patch the actualResponseStatus with the returned value
patch = client.RawPatch(types.MergePatchType,
[]byte(fmt.Sprintf(`{"data":{"%s-actualResponseStatus":"%s"}}`, hookName, response.GetStatus()))) //nolint:gocritic
}
if err := h.Client.Patch(ctx, configMap, patch); err != nil {
return errors.Wrapf(err, "failed to update the ConfigMap %s/%s", namespace, configMapName)
}
Expand Down
2 changes: 1 addition & 1 deletion test/framework/cluster_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ type DeleteClusterInput struct {
Cluster *clusterv1.Cluster
}

// DeleteCluster deletes the cluster and waits for everything the cluster owned to actually be gone.
// DeleteCluster deletes the cluster.
func DeleteCluster(ctx context.Context, input DeleteClusterInput) {
Byf("Deleting cluster %s", input.Cluster.GetName())
Expect(input.Deleter.Delete(ctx, input.Cluster)).To(Succeed())
Expand Down
34 changes: 20 additions & 14 deletions test/framework/cluster_topology_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,20 +128,26 @@ func UpgradeClusterTopologyAndWaitForUpgrade(ctx context.Context, input UpgradeC
KubernetesVersion: input.KubernetesUpgradeVersion,
}, input.WaitForKubeProxyUpgrade...)

log.Logf("Waiting for CoreDNS to have the upgraded image tag")
WaitForDNSUpgrade(ctx, WaitForDNSUpgradeInput{
Getter: workloadClient,
DNSVersion: input.DNSImageTag,
}, input.WaitForDNSUpgrade...)

log.Logf("Waiting for etcd to have the upgraded image tag")
lblSelector, err := labels.Parse("component=etcd")
Expect(err).ToNot(HaveOccurred())
WaitForPodListCondition(ctx, WaitForPodListConditionInput{
Lister: workloadClient,
ListOptions: &client.ListOptions{LabelSelector: lblSelector},
Condition: EtcdImageTagCondition(input.EtcdImageTag, int(*input.ControlPlane.Spec.Replicas)),
}, input.WaitForEtcdUpgrade...)
// Wait for the CoreDNS upgrade if the DNSImageTag is set.
if input.DNSImageTag != "" {
log.Logf("Waiting for CoreDNS to have the upgraded image tag")
WaitForDNSUpgrade(ctx, WaitForDNSUpgradeInput{
Getter: workloadClient,
DNSVersion: input.DNSImageTag,
}, input.WaitForDNSUpgrade...)
}

// Wait for the etcd upgrade if the EtcdImageTag is set.
if input.EtcdImageTag != "" {
log.Logf("Waiting for etcd to have the upgraded image tag")
lblSelector, err := labels.Parse("component=etcd")
Expect(err).ToNot(HaveOccurred())
WaitForPodListCondition(ctx, WaitForPodListConditionInput{
Lister: workloadClient,
ListOptions: &client.ListOptions{LabelSelector: lblSelector},
Condition: EtcdImageTagCondition(input.EtcdImageTag, int(*input.ControlPlane.Spec.Replicas)),
}, input.WaitForEtcdUpgrade...)
}

// Once the ControlPlane is upgraded we can run PreWaitForMachineDeploymentToBeUpgraded.
// Note: This can e.g. be used to verify the AfterControlPlaneUpgrade lifecycle hook is executed
Expand Down
Loading