Skip to content

Commit

Permalink
UPSTREAM: 124795: e2e: DaemonSet maxSurge test should account for terminated pods
Browse files Browse the repository at this point in the history

Accounts for pods that are terminated by the test itself.
  • Loading branch information
atiratree authored and openshift-cherrypick-robot committed Jun 12, 2024
1 parent a2c84a5 commit f3a0bbd
Showing 1 changed file with 26 additions and 11 deletions.
37 changes: 26 additions & 11 deletions test/e2e/apps/daemon_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -585,10 +585,12 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
framework.ExpectNoError(err)
nodeCount := len(nodes.Items)
retryTimeout := dsRetryTimeout + time.Duration(nodeCount*30)*time.Second
// We disturb daemonset progress by randomly terminating pods.
randomPodTerminationTimeout := 5 * time.Minute
retryTimeout := dsRetryTimeout + randomPodTerminationTimeout + time.Duration(nodeCount*30)*time.Second

ginkgo.By("Check that daemon pods surge and invariants are preserved during that rollout")
ageOfOldPod := make(map[string]time.Time)
nodeToAgeOfOldPod := make(map[string]map[string]time.Time)
deliberatelyDeletedPods := sets.NewString()
err = wait.PollImmediateWithContext(ctx, dsRetryPeriod, retryTimeout, func(ctx context.Context) (bool, error) {
podList, err := c.CoreV1().Pods(ds.Namespace).List(ctx, metav1.ListOptions{})
Expand Down Expand Up @@ -682,17 +684,25 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
// if this is a pod in an older version AND there is a new version of this pod, record when
// we started seeing this, otherwise delete the record (perhaps the node was drained)
if nodesToVersions[pod.Spec.NodeName][newVersion] > 0 {
if _, ok := ageOfOldPod[string(pod.UID)]; !ok {
ageOfOldPod[string(pod.UID)] = now
if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)]; !ok {
if _, ok := nodeToAgeOfOldPod[pod.Spec.NodeName]; !ok {
nodeToAgeOfOldPod[pod.Spec.NodeName] = make(map[string]time.Time)
}
nodeToAgeOfOldPod[pod.Spec.NodeName][string(pod.UID)] = now
}
} else {
delete(ageOfOldPod, string(pod.UID))
delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
}
}
// purge the old pods list of any deleted pods
for uid := range ageOfOldPod {
if !podUIDs.Has(uid) {
delete(ageOfOldPod, uid)
for node, uidToTime := range nodeToAgeOfOldPod {
for uid := range uidToTime {
if !podUIDs.Has(uid) {
delete(uidToTime, uid)
}
}
if len(uidToTime) == 0 {
delete(nodeToAgeOfOldPod, node)
}
}
deliberatelyDeletedPods = deliberatelyDeletedPods.Intersection(deletedPodUIDs)
Expand All @@ -713,9 +723,11 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
}

// invariant: the controller must react to the new pod becoming ready within a reasonable timeframe (2x grace period)
for uid, firstSeen := range ageOfOldPod {
if now.Sub(firstSeen) > maxSurgeOverlap {
errs = append(errs, fmt.Sprintf("An old pod with UID %s has been running alongside a newer version for longer than %s", uid, maxSurgeOverlap))
for node, uidToTime := range nodeToAgeOfOldPod {
for uid, firstSeenSinceNewVersionPod := range uidToTime {
if now.Sub(firstSeenSinceNewVersionPod) > maxSurgeOverlap {
errs = append(errs, fmt.Sprintf("An old pod with UID %s on a node %s has been running alongside a newer version for longer than %s", uid, node, maxSurgeOverlap))
}
}
}

Expand Down Expand Up @@ -800,6 +812,9 @@ var _ = SIGDescribe("Daemon set [Serial]", func() {
} else {
framework.Logf("Deleted pod %s prematurely", pod.Name)
deliberatelyDeletedPods.Insert(string(pod.UID))
// If it is an old version we do not need to measure the controller reaction because we have done it instead.
// If it is a new version, we have to reset the time to start counting the time for the replacement pod to reach readiness again.
delete(nodeToAgeOfOldPod, pod.Spec.NodeName)
}
}
}
Expand Down

0 comments on commit f3a0bbd

Please sign in to comment.