Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-4.15] OCPBUGS-35331: UPSTREAM: 124795: e2e: DaemonSet maxSurge test should account for terminated pods #1988

Open
wants to merge 1 commit into
base: release-4.15
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 26 additions & 11 deletions test/e2e/apps/daemon_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -585,10 +585,12 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
framework.ExpectNoError(err)
nodeCount := len(nodes.Items)
retryTimeout := dsRetryTimeout + time.Duration(nodeCount*30)*time.Second
// We disturb daemonset progress by randomly terminating pods.
randomPodTerminationTimeout := 5 * time.Minute
retryTimeout := dsRetryTimeout + randomPodTerminationTimeout + time.Duration(nodeCount*30)*time.Second

ginkgo.By("Check that daemon pods surge and invariants are preserved during that rollout")
ageOfOldPod := make(map[string]time.Time)
nodeToAgeOfOldPod := make(map[string]map[string]time.Time)
deliberatelyDeletedPods := sets.NewString()
err = wait.PollImmediateWithContext(ctx, dsRetryPeriod, retryTimeout, func(ctx context.Context) (bool, error) {
podList, err := c.CoreV1().Pods(ds.Namespace).List(ctx, metav1.ListOptions{})
Expand Down Expand Up @@ -682,17 +684,25 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
// if this is a pod in an older version AND there is a new version of this pod, record when
// we started seeing this, otherwise delete the record (perhaps the node was drained)
if nodesToVersions[pod.Spec.NodeName][newVersion] > 0 {
if _, ok := ageOfOldPod[string(pod.UID)]; !ok {
ageOfOldPod[string(pod.UID)] = now
if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)]; !ok {
if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName]; !ok {
nodeToAgeOfOldPod[pod.Spec.NodeName] = make(map[string]time.Time)
}
nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)] = now
}
} else {
delete(ageOfOldPod, string(pod.UID))
delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
}
}
// purge the old pods list of any deleted pods
for uid := range ageOfOldPod {
if !podUIDs.Has(uid) {
delete(ageOfOldPod, uid)
for node, uidToTime := range nodeToAgeOfOldPod {
for uid := range uidToTime {
if !podUIDs.Has(uid) {
delete(uidToTime, uid)
}
}
if len(uidToTime) == 0 {
delete(nodeToAgeOfOldPod, node)
}
}
deliberatelyDeletedPods = deliberatelyDeletedPods.Intersection(deletedPodUIDs)
Expand All @@ -713,9 +723,11 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
}

// invariant: the controller must react to the new pod becoming ready within a reasonable timeframe (2x grace period)
for uid, firstSeen := range ageOfOldPod {
if now.Sub(firstSeen) > maxSurgeOverlap {
errs = append(errs, fmt.Sprintf("An old pod with UID %s has been running alongside a newer version for longer than %s", uid, maxSurgeOverlap))
for node, uidToTime := range nodeToAgeOfOldPod {
for uid, firstSeenSinceNewVersionPod := range uidToTime {
if now.Sub(firstSeenSinceNewVersionPod) > maxSurgeOverlap {
errs = append(errs, fmt.Sprintf("An old pod with UID %s on a node %s has been running alongside a newer version for longer than %s", uid, node, maxSurgeOverlap))
}
}
}

Expand Down Expand Up @@ -800,6 +812,9 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
} else {
framework.Logf("Deleted pod %s prematurely", pod.Name)
deliberatelyDeletedPods.Insert(string(pod.UID))
// If the deleted pod is an old version, we do not need to measure the controller's reaction because we removed the pod ourselves.
// If it is a new version, we have to reset the recorded time so that counting starts again for the replacement pod to reach readiness.
delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
}
}
}
Expand Down