Skip to content

Commit

Permalink
add feature flag, add e2e tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Nicole Morales committed May 17, 2024
1 parent 93b53dc commit 9509765
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 52 deletions.
1 change: 1 addition & 0 deletions controllers/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ type FoundationDBClusterReconciler struct {
ServerSideApply bool
EnableRecoveryState bool
CacheDatabaseStatusForReconciliationDefault bool
ReplaceOnSecurityContextChange bool
PodLifecycleManager podmanager.PodLifecycleManager
PodClientProvider func(*fdbv1beta2.FoundationDBCluster, *corev1.Pod) (podclient.FdbPodClient, error)
DatabaseClientProvider fdbadminclient.DatabaseClientProvider
Expand Down
2 changes: 1 addition & 1 deletion controllers/replace_misconfigured_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func (c replaceMisconfiguredProcessGroups) reconcile(ctx context.Context, r *Fou
return &requeue{curError: err}
}

hasReplacements, err := replacements.ReplaceMisconfiguredProcessGroups(ctx, r.PodLifecycleManager, r, logger, cluster, internal.CreatePVCMap(cluster, pvcs))
hasReplacements, err := replacements.ReplaceMisconfiguredProcessGroups(ctx, r.PodLifecycleManager, r, logger, cluster, internal.CreatePVCMap(cluster, pvcs), r.ReplaceOnSecurityContextChange)
if err != nil {
return &requeue{curError: err}
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/update_pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ func getPodsToUpdate(ctx context.Context, logger logr.Logger, reconciler *Founda
continue
}

needsRemoval, err := replacements.ProcessGroupNeedsRemoval(ctx, reconciler.PodLifecycleManager, reconciler, logger, cluster, processGroup, pvcMap)
needsRemoval, err := replacements.ProcessGroupNeedsRemoval(ctx, reconciler.PodLifecycleManager, reconciler, logger, cluster, processGroup, pvcMap, reconciler.ReplaceOnSecurityContextChange)
// Do not update the Pod if unable to determine if it needs to be removed.
if err != nil {
logger.V(1).Info("Skip process group, error checking if it requires a removal",
Expand Down
30 changes: 29 additions & 1 deletion e2e/fixtures/fdb_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,7 @@ func (fdbCluster *FdbCluster) UpdateClusterSpec() {
fdbCluster.UpdateClusterSpecWithSpec(fdbCluster.cluster.Spec.DeepCopy())
}

// UpdateClusterSpecWithSpec ensures that the FoundationDBCluster will be updated in Kubernetes. This method as a retry mechanism
// UpdateClusterSpecWithSpec ensures that the FoundationDBCluster will be updated in Kubernetes. This method has a retry mechanism
// implemented and ensures that the provided (local) Spec matches the Spec in Kubernetes. You must make sure that you call
// fdbCluster.GetCluster() before updating the spec, to make sure you are not overwriting the current state with an outdated state.
// An example on how to update a field with this method:
Expand Down Expand Up @@ -1117,6 +1117,34 @@ func (fdbCluster *FdbCluster) GetCustomParameters(
return fdbCluster.cluster.Spec.Processes[processClass].CustomParameters
}

// SetPodTemplateSpec allows to set the pod template spec of the provided process class.
func (fdbCluster *FdbCluster) SetPodTemplateSpec(
processClass fdbv1beta2.ProcessClass,
podTemplateSpec *corev1.PodSpec,
waitForReconcile bool,
) error {
setting, ok := fdbCluster.cluster.Spec.Processes[processClass]
if !ok {
return fmt.Errorf("could not find process settings for process class %s", processClass)
}
setting.PodTemplate.Spec = *podTemplateSpec

fdbCluster.cluster.Spec.Processes[processClass] = setting
fdbCluster.UpdateClusterSpec()
if !waitForReconcile {
return nil
}

return fdbCluster.WaitForReconciliation()
}

// GetPodTemplateSpec returns the current pod template spec for the specified process class.
func (fdbCluster *FdbCluster) GetPodTemplateSpec(
processClass fdbv1beta2.ProcessClass,
) *corev1.PodSpec {
return &fdbCluster.cluster.Spec.Processes[processClass].PodTemplate.Spec
}

// CheckPodIsDeleted return true if Pod no longer exists at the executed time point
func (fdbCluster *FdbCluster) CheckPodIsDeleted(podName string) bool {
pod := &corev1.Pod{}
Expand Down
1 change: 1 addition & 0 deletions e2e/fixtures/fdb_operator_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ spec:
- --minimum-recovery-time-for-exclusion=1.0
- --cluster-label-key-for-node-trigger=foundationdb.org/fdb-cluster-name
- --enable-node-index
- --replace-on-security-context-change
`
)

Expand Down
44 changes: 44 additions & 0 deletions e2e/test_operator/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,7 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
fdbCluster.EnsurePodIsDeleted(replacedPod.Name)
})
})

})

When("setting storageServersPerPod", func() {
Expand Down Expand Up @@ -1428,6 +1429,49 @@ var _ = Describe("Operator", Label("e2e", "pr"), func() {
})
})

// TODO also test container level security context change
When("replacing Pods due to securityContext changes", func() {
var initialPods *corev1.PodList
var originalPodSpec, modifiedPodSpec *corev1.PodSpec

BeforeEach(func() {
originalPodSpec = fdbCluster.GetPodTemplateSpec(fdbv1beta2.ProcessClassLog)
modifiedPodSpec = originalPodSpec.DeepCopy()
modifiedPodSpec.SecurityContext.FSGroupChangePolicy = &[]corev1.PodFSGroupChangePolicy{corev1.FSGroupChangeOnRootMismatch}[0]
initialPods = fdbCluster.GetLogPods()
Expect(fdbCluster.SetPodTemplateSpec(fdbv1beta2.ProcessClassLog, modifiedPodSpec, false)).NotTo(HaveOccurred())
})

AfterEach(func() {
Expect(fdbCluster.SetPodTemplateSpec(fdbv1beta2.ProcessClassLog, originalPodSpec, false)).NotTo(HaveOccurred())
// TODO maybe test that it does the same for the revert?
})

It("should replace all the log pods (which had the security context change)", func() {
lastForcedReconciliationTime := time.Now()
forceReconcileDuration := 4 * time.Minute
Eventually(func() bool {
// Force a reconcile if needed to make sure we speed up the reconciliation if needed.
if time.Since(lastForcedReconciliationTime) >= forceReconcileDuration {
fdbCluster.ForceReconcile()
lastForcedReconciliationTime = time.Now()
}

// check that all original pods are gone
pods := fdbCluster.GetLogPods()
Expect(pods.Items).NotTo(ContainElements(initialPods.Items))
// TODO maybe instead check if all logs are marked for removal? unclear how long *all* would take?
//for _, processGroup := range fdbCluster.GetCluster().Status.ProcessGroups {
// if processGroup.IsMarkedForRemoval() && processGroup.IsExcluded() {
// continue
// }
// <fail>
//}
return true
}).WithTimeout(20 * time.Minute).WithPolling(5 * time.Second).Should(BeTrue())
})
})

When("migrating a cluster to make use of DNS in the cluster file", func() {
BeforeEach(func() {
cluster := fdbCluster.GetCluster()
Expand Down
22 changes: 13 additions & 9 deletions internal/replacements/replacements.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ import (
)

// ReplaceMisconfiguredProcessGroups checks if the cluster has any misconfigured process groups that must be replaced.
func ReplaceMisconfiguredProcessGroups(ctx context.Context, podManager podmanager.PodLifecycleManager, client client.Client, log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, pvcMap map[fdbv1beta2.ProcessGroupID]corev1.PersistentVolumeClaim) (bool, error) {
func ReplaceMisconfiguredProcessGroups(ctx context.Context, podManager podmanager.PodLifecycleManager, client client.Client, log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, pvcMap map[fdbv1beta2.ProcessGroupID]corev1.PersistentVolumeClaim, replaceOnSecurityContextChange bool) (bool, error) {
hasReplacements := false

maxReplacements, _ := getReplacementInformation(cluster, cluster.GetMaxConcurrentReplacements())
Expand All @@ -54,7 +54,7 @@ func ReplaceMisconfiguredProcessGroups(ctx context.Context, podManager podmanage
continue
}

needsRemoval, err := ProcessGroupNeedsRemoval(ctx, podManager, client, log, cluster, processGroup, pvcMap)
needsRemoval, err := ProcessGroupNeedsRemoval(ctx, podManager, client, log, cluster, processGroup, pvcMap, replaceOnSecurityContextChange)

// Do not mark for removal if there is an error
if err != nil {
Expand All @@ -72,7 +72,7 @@ func ReplaceMisconfiguredProcessGroups(ctx context.Context, podManager podmanage
}

// ProcessGroupNeedsRemoval checks if a process group needs to be removed.
func ProcessGroupNeedsRemoval(ctx context.Context, podManager podmanager.PodLifecycleManager, client client.Client, log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, processGroup *fdbv1beta2.ProcessGroupStatus, pvcMap map[fdbv1beta2.ProcessGroupID]corev1.PersistentVolumeClaim) (bool, error) {
func ProcessGroupNeedsRemoval(ctx context.Context, podManager podmanager.PodLifecycleManager, client client.Client, log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, processGroup *fdbv1beta2.ProcessGroupStatus, pvcMap map[fdbv1beta2.ProcessGroupID]corev1.PersistentVolumeClaim, replaceOnSecurityContextChange bool) (bool, error) {
// TODO(johscheuer): Fix how we fetch the pvc to make better use of the controller runtime cache.
pvc, hasPVC := pvcMap[processGroup.ProcessGroupID]
pod, podErr := podManager.GetPod(ctx, client, cluster, processGroup.GetPodName(cluster))
Expand All @@ -96,7 +96,7 @@ func ProcessGroupNeedsRemoval(ctx context.Context, podManager podmanager.PodLife
return false, podErr
}

return processGroupNeedsRemovalForPod(cluster, pod, processGroup, log)
return processGroupNeedsRemovalForPod(cluster, pod, processGroup, log, replaceOnSecurityContextChange)
}

func processGroupNeedsRemovalForPVC(cluster *fdbv1beta2.FoundationDBCluster, pvc corev1.PersistentVolumeClaim, log logr.Logger, processGroup *fdbv1beta2.ProcessGroupStatus) (bool, error) {
Expand Down Expand Up @@ -140,7 +140,7 @@ func processGroupNeedsRemovalForPVC(cluster *fdbv1beta2.FoundationDBCluster, pvc
return false, nil
}

func processGroupNeedsRemovalForPod(cluster *fdbv1beta2.FoundationDBCluster, pod *corev1.Pod, processGroupStatus *fdbv1beta2.ProcessGroupStatus, log logr.Logger) (bool, error) {
func processGroupNeedsRemovalForPod(cluster *fdbv1beta2.FoundationDBCluster, pod *corev1.Pod, processGroupStatus *fdbv1beta2.ProcessGroupStatus, log logr.Logger, replaceOnSecurityContextChange bool) (bool, error) {
if pod == nil {
return false, nil
}
Expand Down Expand Up @@ -251,10 +251,14 @@ func processGroupNeedsRemovalForPod(cluster *fdbv1beta2.FoundationDBCluster, pod
if err != nil {
return false, err
}
// TODO deprecated builtin k8s features edited securityContext automatically, and it doesn't seem outlandish that someone's cluster
// could use it or a similar feature, and it would result in constant replacements with no solution unless we feature
// guard this... (https://kubernetes.io/blog/2021/04/06/podsecuritypolicy-deprecation-past-present-and-future/)
return fileSecurityContextChanged(desiredPod, pod, log), nil
// Some k8s instances have security context vetting which may edit the spec automatically.
// This would cause changes to security context on a pod or container
// to constantly be seen as having a security context change, hence we want to feature guard this.
// https://kubernetes.io/blog/2021/04/06/podsecuritypolicy-deprecation-past-present-and-future/
if replaceOnSecurityContextChange {
return fileSecurityContextChanged(desiredPod, pod, log), nil
}
return false, nil
}

func resourcesNeedsReplacement(desired []corev1.Container, current []corev1.Container) bool {
Expand Down
Loading

0 comments on commit 9509765

Please sign in to comment.