From dbb5ac8ffe5d79ac24d9b6211fa153a7b3d0d9db Mon Sep 17 00:00:00 2001 From: Miciah Masters Date: Fri, 4 Nov 2022 11:09:31 -0400 Subject: [PATCH 1/2] Absorb PodsScheduled condition into MinAvailable Remove the PodsScheduled status condition on IngressController objects, and report the information that PodsScheduled reported instead in the DeploymentReplicasMinAvailable status condition's message when its status is "False". Before this commit, the PodsScheduled status condition could erroneously signal that there was a problem when some misscheduled pods existed even if there were enough correctly scheduled and available pods. The PodsScheduled status condition was added in order to provide diagnostic information when there was a problem. That is, PodsScheduled was intended to indicate *why* there was a problem, not *whether* there was a problem. The DeploymentReplicasMinAvailable status condition definitively indicates *whether* there is a problem, and its status message should indicate *why* there is a problem. Having a separate status condition for the "why" is superfluous and unnecessarily complicated in this instance. Follow-up to commit c5e0f4a513a327a42fb4a184ca1b1b9b7f39f505. This bug fixes OCPBUGS-434. https://issues.redhat.com/browse/OCPBUGS-434 * pkg/operator/controller/ingress/controller.go (IngressControllerPodsScheduledConditionType): Delete const. * pkg/operator/controller/ingress/metrics_test.go (TestDeleteIngressControllerConditionsMetric): Delete a reference to the "PodsScheduled" status condition in a test case. * pkg/operator/controller/ingress/status.go (syncIngressControllerStatus): Delete the call to computeDeploymentPodsScheduledCondition. Pass the list of pods to computeDeploymentReplicasMinAvailableCondition. (PruneConditions): Delete pruning of the "DeploymentDegraded" status condition (which was removed in 4.6.0) and instead prune the "PodsScheduled" status condition. (computeDeploymentPodsScheduledCondition): Rename... (checkPodsScheduledForDeployment): ...to this. Change the return value from an operator status condition to an error value. (computeDeploymentReplicasMinAvailableCondition): Call checkPodsScheduledForDeployment and include the error string in the status condition's message if the function returns an error. (computeIngressDegradedCondition): Remove IngressControllerPodsScheduledConditionType from the list of conditions to check. * pkg/operator/controller/ingress/status_test.go (TestComputePodsScheduledCondition): Rename... (Test_checkPodsScheduledForDeployment): ...to this. Update for the function's new name and result value. (TestComputeIngressDegradedCondition): Delete test cases for PodsScheduled. (TestComputeDeploymentReplicasMinAvailableCondition): Update to set the deployment's label selector and pass in a list of pods (which is empty). --- pkg/operator/controller/ingress/controller.go | 1 - .../controller/ingress/metrics_test.go | 1 - pkg/operator/controller/ingress/status.go | 50 ++++------- .../controller/ingress/status_test.go | 90 ++++++++----------- 4 files changed, 57 insertions(+), 85 deletions(-) diff --git a/pkg/operator/controller/ingress/controller.go b/pkg/operator/controller/ingress/controller.go index b88089080..cb4e7c088 100644 --- a/pkg/operator/controller/ingress/controller.go +++ b/pkg/operator/controller/ingress/controller.go @@ -50,7 +50,6 @@ const ( // TODO: consider moving these to openshift/api const ( IngressControllerAdmittedConditionType = "Admitted" - IngressControllerPodsScheduledConditionType = "PodsScheduled" IngressControllerDeploymentAvailableConditionType = "DeploymentAvailable" IngressControllerDeploymentReplicasMinAvailableConditionType = "DeploymentReplicasMinAvailable" IngressControllerDeploymentReplicasAllAvailableConditionType = "DeploymentReplicasAllAvailable" diff --git a/pkg/operator/controller/ingress/metrics_test.go b/pkg/operator/controller/ingress/metrics_test.go index b8f8ac286..d9811093d 100644 --- a/pkg/operator/controller/ingress/metrics_test.go +++ b/pkg/operator/controller/ingress/metrics_test.go @@ -77,7 +77,6 @@ func TestDeleteIngressControllerConditionsMetric(t *testing.T) { {Type: "Available", Status: operatorv1.ConditionFalse}, {Type: "Degraded", Status: operatorv1.ConditionTrue}, {Type: "Admitted", Status: operatorv1.ConditionTrue}, - {Type: "PodsScheduled", Status: operatorv1.ConditionTrue}, }), expectedMetricFormat: ` # HELP ingress_controller_conditions Report the conditions for ingress controllers. 0 is False and 1 is True. diff --git a/pkg/operator/controller/ingress/status.go b/pkg/operator/controller/ingress/status.go index 7195b18f8..0c866d540 100644 --- a/pkg/operator/controller/ingress/status.go +++ b/pkg/operator/controller/ingress/status.go @@ -75,9 +75,8 @@ func (r *reconciler) syncIngressControllerStatus(ic *operatorv1.IngressControlle updated.Status.EndpointPublishingStrategy.LoadBalancer.AllowedSourceRanges = computeAllowedSourceRanges(service) } - updated.Status.Conditions = MergeConditions(updated.Status.Conditions, computeDeploymentPodsScheduledCondition(deployment, pods)) updated.Status.Conditions = MergeConditions(updated.Status.Conditions, computeDeploymentAvailableCondition(deployment)) - updated.Status.Conditions = MergeConditions(updated.Status.Conditions, computeDeploymentReplicasMinAvailableCondition(deployment)) + updated.Status.Conditions = MergeConditions(updated.Status.Conditions, computeDeploymentReplicasMinAvailableCondition(deployment, pods)) updated.Status.Conditions = MergeConditions(updated.Status.Conditions, computeDeploymentReplicasAllAvailableCondition(deployment)) updated.Status.Conditions = MergeConditions(updated.Status.Conditions, computeDeploymentRollingOutCondition(deployment)) updated.Status.Conditions = MergeConditions(updated.Status.Conditions, computeLoadBalancerStatus(ic, service, operandEvents)...) @@ -153,9 +152,9 @@ func MergeConditions(conditions []operatorv1.OperatorCondition, updates ...opera // Returns the updated condition array. func PruneConditions(conditions []operatorv1.OperatorCondition) []operatorv1.OperatorCondition { for i, condition := range conditions { - // TODO: Remove this fix-up logic in 4.8 - if condition.Type == "DeploymentDegraded" { - // DeploymentDegraded was removed in 4.6.0 + // PodsScheduled was removed in 4.13.0. + // TODO: Remove this fix-up logic in 4.14. + if condition.Type == "PodsScheduled" { conditions = append(conditions[:i], conditions[i+1:]...) } } @@ -204,18 +203,14 @@ func computeAllowedSourceRanges(service *corev1.Service) []operatorv1.CIDR { return nil } -// computeDeploymentPodsScheduledCondition computes the ingress controller's -// current PodsScheduled status condition state by inspecting the PodScheduled -// conditions of the pods associated with the deployment. -func computeDeploymentPodsScheduledCondition(deployment *appsv1.Deployment, pods []corev1.Pod) operatorv1.OperatorCondition { +// checkPodsScheduledForDeployment checks whether the given deployment has a +// valid label selector and whether all of the deployment's pods are reporting +// PodsScheduled=True. This function returns an error value indicating the +// result of that check. +func checkPodsScheduledForDeployment(deployment *appsv1.Deployment, pods []corev1.Pod) error { selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector) if err != nil || selector.Empty() { - return operatorv1.OperatorCondition{ - Type: IngressControllerPodsScheduledConditionType, - Status: operatorv1.ConditionUnknown, - Reason: "InvalidLabelSelector", - Message: "Deployment has an invalid label selector.", - } + return errors.New("Deployment has an invalid label selector.") } unscheduled := make(map[*corev1.Pod]corev1.PodCondition) for i, pod := range pods { @@ -255,17 +250,9 @@ func computeDeploymentPodsScheduledCondition(deployment *appsv1.Deployment, pods if haveUnschedulable { message = message + " Make sure you have sufficient worker nodes." } - return operatorv1.OperatorCondition{ - Type: IngressControllerPodsScheduledConditionType, - Status: operatorv1.ConditionFalse, - Reason: "PodsNotScheduled", - Message: message, - } - } - return operatorv1.OperatorCondition{ - Type: IngressControllerPodsScheduledConditionType, - Status: operatorv1.ConditionTrue, + return errors.New(message) } + return nil } // computeIngressAvailableCondition computes the ingress controller's current Available status state @@ -411,7 +398,7 @@ func computeDeploymentAvailableCondition(deployment *appsv1.Deployment) operator // update parameters. The "DeploymentReplicasMinAvailable" condition is true if // the number of available replicas is equal to or greater than the number of // desired replicas less the number minimum available. -func computeDeploymentReplicasMinAvailableCondition(deployment *appsv1.Deployment) operatorv1.OperatorCondition { +func computeDeploymentReplicasMinAvailableCondition(deployment *appsv1.Deployment, pods []corev1.Pod) operatorv1.OperatorCondition { replicas := int32(1) if deployment.Spec.Replicas != nil { replicas = *deployment.Spec.Replicas @@ -450,11 +437,15 @@ func computeDeploymentReplicasMinAvailableCondition(deployment *appsv1.Deploymen maxUnavailable = 1 } if int(deployment.Status.AvailableReplicas) < int(replicas)-maxUnavailable { + message := fmt.Sprintf("%d/%d of replicas are available, max unavailable is %d", deployment.Status.AvailableReplicas, replicas, maxUnavailable) + if err := checkPodsScheduledForDeployment(deployment, pods); err != nil { + message = fmt.Sprintf("%s: %s", message, err.Error()) + } return operatorv1.OperatorCondition{ Type: IngressControllerDeploymentReplicasMinAvailableConditionType, Status: operatorv1.ConditionFalse, Reason: "DeploymentMinimumReplicasNotMet", - Message: fmt.Sprintf("%d/%d of replicas are available, max unavailable is %d", deployment.Status.AvailableReplicas, replicas, maxUnavailable), + Message: message, } } @@ -556,11 +547,6 @@ func computeIngressDegradedCondition(conditions []operatorv1.OperatorCondition, condition: IngressControllerAdmittedConditionType, status: operatorv1.ConditionTrue, }, - { - condition: IngressControllerPodsScheduledConditionType, - status: operatorv1.ConditionTrue, - gracePeriod: time.Minute * 10, - }, { condition: IngressControllerDeploymentAvailableConditionType, status: operatorv1.ConditionTrue, diff --git a/pkg/operator/controller/ingress/status_test.go b/pkg/operator/controller/ingress/status_test.go index dc6fb1632..c7006e40a 100644 --- a/pkg/operator/controller/ingress/status_test.go +++ b/pkg/operator/controller/ingress/status_test.go @@ -122,7 +122,7 @@ func cond(t string, status operatorv1.ConditionStatus, reason string, lt time.Ti } } -func TestComputePodsScheduledCondition(t *testing.T) { +func Test_checkPodsScheduledForDeployment(t *testing.T) { deployment := &appsv1.Deployment{ Spec: appsv1.DeploymentSpec{ Selector: &metav1.LabelSelector{ @@ -203,46 +203,50 @@ func TestComputePodsScheduledCondition(t *testing.T) { }, } tests := []struct { - name string - deployment *appsv1.Deployment - pods []corev1.Pod - expect operatorv1.ConditionStatus + name string + deployment *appsv1.Deployment + pods []corev1.Pod + expectError bool }{ { - name: "no pods", - deployment: deployment, - pods: []corev1.Pod{unrelatedPod}, - expect: operatorv1.ConditionTrue, + name: "no pods", + deployment: deployment, + pods: []corev1.Pod{unrelatedPod}, + expectError: false, }, { - name: "all pods scheduled", - deployment: deployment, - pods: []corev1.Pod{scheduledPod}, - expect: operatorv1.ConditionTrue, + name: "all pods scheduled", + deployment: deployment, + pods: []corev1.Pod{scheduledPod}, + expectError: false, }, { - name: "some pod unscheduled", - deployment: deployment, - pods: []corev1.Pod{scheduledPod, unscheduledPod}, - expect: operatorv1.ConditionFalse, + name: "some pod unscheduled", + deployment: deployment, + pods: []corev1.Pod{scheduledPod, unscheduledPod}, + expectError: true, }, { - name: "some pod unschedulable", - deployment: deployment, - pods: []corev1.Pod{scheduledPod, unschedulablePod}, - expect: operatorv1.ConditionFalse, + name: "some pod unschedulable", + deployment: deployment, + pods: []corev1.Pod{scheduledPod, unschedulablePod}, + expectError: true, }, { - name: "deployment with empty label selector", - deployment: invalidDeployment, - expect: operatorv1.ConditionUnknown, + name: "deployment with empty label selector", + deployment: invalidDeployment, + expectError: true, }, } for _, test := range tests { - actual := computeDeploymentPodsScheduledCondition(test.deployment, test.pods) - if actual.Status != test.expect { - t.Errorf("%q: expected %v, got %v", test.name, test.expect, actual.Status) - } + t.Run(test.name, func(t *testing.T) { + switch err := checkPodsScheduledForDeployment(test.deployment, test.pods); { + case err == nil && test.expectError: + t.Error("expected error, got nil") + case err != nil && !test.expectError: + t.Errorf("got unexpected error: %v", err) + } + }) } } @@ -279,26 +283,6 @@ func TestComputeIngressDegradedCondition(t *testing.T) { // Just use the one minute retry duration for this degraded condition expectAfter: time.Minute, }, - { - name: "pods not scheduled for <10m", - conditions: []operatorv1.OperatorCondition{ - cond(IngressControllerPodsScheduledConditionType, operatorv1.ConditionFalse, "", clock.Now().Add(time.Minute*-9)), - }, - expectIngressDegradedStatus: operatorv1.ConditionFalse, - expectRequeue: true, - // Grace period is 10 minutes, subtract the 9 minute spoofed last transition time - expectAfter: time.Minute, - }, - { - name: "pods not scheduled for >10m", - conditions: []operatorv1.OperatorCondition{ - cond(IngressControllerPodsScheduledConditionType, operatorv1.ConditionFalse, "", clock.Now().Add(time.Minute*-31)), - }, - expectIngressDegradedStatus: operatorv1.ConditionTrue, - expectRequeue: true, - // Just use the one minute retry duration for this degraded condition - expectAfter: time.Minute, - }, { name: "deployment unavailable for <30s", conditions: []operatorv1.OperatorCondition{ @@ -426,7 +410,6 @@ func TestComputeIngressDegradedCondition(t *testing.T) { name: "DNS not ready and deployment unavailable", conditions: []operatorv1.OperatorCondition{ cond(IngressControllerAdmittedConditionType, operatorv1.ConditionTrue, "", clock.Now().Add(time.Hour*-1)), - cond(IngressControllerPodsScheduledConditionType, operatorv1.ConditionFalse, "", clock.Now().Add(time.Hour*-1)), cond(IngressControllerDeploymentAvailableConditionType, operatorv1.ConditionFalse, "", clock.Now().Add(time.Hour*-1)), cond(IngressControllerDeploymentReplicasMinAvailableConditionType, operatorv1.ConditionFalse, "", clock.Now().Add(time.Hour*-1)), cond(IngressControllerDeploymentReplicasAllAvailableConditionType, operatorv1.ConditionFalse, "", clock.Now().Add(time.Hour*-1)), @@ -444,7 +427,6 @@ func TestComputeIngressDegradedCondition(t *testing.T) { name: "admitted, DNS, LB, and deployment OK", conditions: []operatorv1.OperatorCondition{ cond(IngressControllerAdmittedConditionType, operatorv1.ConditionTrue, "", clock.Now().Add(time.Hour*-1)), - cond(IngressControllerPodsScheduledConditionType, operatorv1.ConditionTrue, "", clock.Now().Add(time.Hour*-1)), cond(IngressControllerDeploymentAvailableConditionType, operatorv1.ConditionTrue, "", clock.Now().Add(time.Hour*-1)), cond(IngressControllerDeploymentReplicasMinAvailableConditionType, operatorv1.ConditionTrue, "", clock.Now().Add(time.Hour*-1)), cond(IngressControllerDeploymentReplicasAllAvailableConditionType, operatorv1.ConditionTrue, "", clock.Now().Add(time.Hour*-1)), @@ -1060,6 +1042,12 @@ func TestComputeDeploymentReplicasMinAvailableCondition(t *testing.T) { deploy := &appsv1.Deployment{ Spec: appsv1.DeploymentSpec{ Replicas: test.replicas, + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "ingresscontroller.operator.openshift.io/deployment-ingresscontroller": "default", + "ingresscontroller.operator.openshift.io/hash": "75678b564c", + }, + }, Strategy: appsv1.DeploymentStrategy{ Type: appsv1.RollingUpdateDeploymentStrategyType, RollingUpdate: test.rollingUpdate, @@ -1070,7 +1058,7 @@ func TestComputeDeploymentReplicasMinAvailableCondition(t *testing.T) { }, } - actual := computeDeploymentReplicasMinAvailableCondition(deploy) + actual := computeDeploymentReplicasMinAvailableCondition(deploy, []corev1.Pod{}) if actual.Status != test.expectDeploymentReplicasMinAvailableStatus { t.Errorf("%q: expected %v, got %v", test.name, test.expectDeploymentReplicasMinAvailableStatus, actual.Status) } From 67b0e9216208406f9ff73f5616f4272d18cb3b50 Mon Sep 17 00:00:00 2001 From: Miciah Masters Date: Tue, 6 Dec 2022 18:46:58 -0500 Subject: [PATCH 2/2] checkPodsScheduledForDeployment: Add nil check checkPodsScheduledForDeployment takes a pointer to a deployment, which should never be nil, so return an error indicating that there is a bug if the deployment is somehow nil. * pkg/operator/controller/ingress/status.go (checkPodsScheduledForDeployment): Check whether deployment is nil and return an error if it is. * pkg/operator/controller/ingress/status_test.go (Test_checkPodsScheduledForDeployment): Add a "nil deployment" test case. --- pkg/operator/controller/ingress/status.go | 3 +++ pkg/operator/controller/ingress/status_test.go | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/pkg/operator/controller/ingress/status.go b/pkg/operator/controller/ingress/status.go index 0c866d540..9bb0a6950 100644 --- a/pkg/operator/controller/ingress/status.go +++ b/pkg/operator/controller/ingress/status.go @@ -208,6 +208,9 @@ func computeAllowedSourceRanges(service *corev1.Service) []operatorv1.CIDR { // PodsScheduled=True. This function returns an error value indicating the // result of that check. func checkPodsScheduledForDeployment(deployment *appsv1.Deployment, pods []corev1.Pod) error { + if deployment == nil { + return errors.New("System error detected: deployment was nil. Please report this issue to Red Hat: https://issues.redhat.com/secure/CreateIssueDetails!init.jspa?pid=12332330&issuetype=1&components=12367900&priority=10300&customfield_12316142=26752") + } selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector) if err != nil || selector.Empty() { return errors.New("Deployment has an invalid label selector.") diff --git a/pkg/operator/controller/ingress/status_test.go b/pkg/operator/controller/ingress/status_test.go index c7006e40a..78da12c3b 100644 --- a/pkg/operator/controller/ingress/status_test.go +++ b/pkg/operator/controller/ingress/status_test.go @@ -208,6 +208,12 @@ func Test_checkPodsScheduledForDeployment(t *testing.T) { pods []corev1.Pod expectError bool }{ + { + name: "nil deployment", + deployment: nil, + pods: []corev1.Pod{scheduledPod}, + expectError: true, + }, { name: "no pods", deployment: deployment,