Skip to content

Commit

Permalink
feat: pod health | restart loop & not ready (#75)
Browse files Browse the repository at this point in the history
* feat: pod health | restart loop & not ready

* Update pkg/health/health_pod.go

Co-authored-by: Moshe Immerman <moshe@flanksource.com>

* feat: container readiness probe waiting

---------

Co-authored-by: Moshe Immerman <moshe@flanksource.com>
  • Loading branch information
adityathebe and moshloop authored Jul 29, 2024
1 parent 9680abe commit 1916ba9
Show file tree
Hide file tree
Showing 5 changed files with 362 additions and 4 deletions.
63 changes: 59 additions & 4 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"strings"
"time"

"github.com/samber/lo"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
Expand Down Expand Up @@ -174,17 +175,45 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
case corev1.PodRunning:
switch pod.Spec.RestartPolicy {
case corev1.RestartPolicyAlways:
// a ready pod is healthy unless one of its containers terminated recently (restart loop), which downgrades it to a warning
if isReady {
health := HealthHealthy
message := pod.Status.Message

// A ready pod can be in a warning state if it has been in a restart loop.
// i.e. the container completes successfully, but the pod keeps restarting.
for _, s := range pod.Status.ContainerStatuses {
if s.LastTerminationState.Terminated != nil {
lastTerminatedTime := s.LastTerminationState.Terminated.FinishedAt.Time
if !lastTerminatedTime.IsZero() && pod.Status.StartTime.Sub(lastTerminatedTime) < time.Hour {
health = HealthWarning
message = fmt.Sprintf("%s has restarted %d time(s)", s.Name, pod.Status.ContainerStatuses[0].RestartCount)
}

break
}
}

return &HealthStatus{
Health: HealthHealthy,
Health: health,
Ready: true,
Status: HealthStatusRunning,
Message: pod.Status.Message,
Message: message,
}, nil
}

// if it's not ready, check to see if any container terminated, if so, it's degraded
var nonReadyContainers []ContainerRecord
for _, ctrStatus := range pod.Status.ContainerStatuses {
if !ctrStatus.Ready {
spec := lo.Filter(pod.Spec.Containers, func(i corev1.Container, _ int) bool {
return i.Name == ctrStatus.Name
})
nonReadyContainers = append(nonReadyContainers, ContainerRecord{
Status: ctrStatus,
Spec: spec[0],
})
}

if ctrStatus.LastTerminationState.Terminated != nil {
return &HealthStatus{
Health: HealthUnhealthy,
Expand All @@ -194,11 +223,32 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
}, nil
}
}

// Pod isn't ready but all containers are
if len(nonReadyContainers) == 0 {
return &HealthStatus{
Health: HealthWarning,
Status: HealthStatusRunning,
Message: pod.Status.Message,
}, nil
}

var containersWaitingForReadinessProbe []string
for _, c := range nonReadyContainers {
if c.Spec.ReadinessProbe == nil || c.Spec.ReadinessProbe.InitialDelaySeconds == 0 {
continue
}

if time.Since(c.Status.State.Running.StartedAt.Time) <= time.Duration(c.Spec.ReadinessProbe.InitialDelaySeconds)*time.Second {
containersWaitingForReadinessProbe = append(containersWaitingForReadinessProbe, c.Spec.Name)
}
}

// otherwise we are progressing towards a ready state
return &HealthStatus{
Health: HealthUnknown,
Status: HealthStatusStarting,
Message: pod.Status.Message,
Message: fmt.Sprintf("Container %s is waiting for readiness probe", strings.Join(containersWaitingForReadinessProbe, ",")),
}, nil

case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
Expand All @@ -222,3 +272,8 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
Message: pod.Status.Message,
}, nil
}

// ContainerRecord pairs a container's spec with its reported status so that
// both can be inspected together.
type ContainerRecord struct {
// Spec is the container definition (corev1.Container).
Spec corev1.Container
// Status is the container's status entry (corev1.ContainerStatus).
Status corev1.ContainerStatus
}
22 changes: 22 additions & 0 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ func assertAppHealth(t *testing.T, yamlPath string, expectedStatus health.Health
assert.Equal(t, expectedStatus, health.Status)
}

// assertAppHealthWithOverwriteMsg computes the health status for the resource
// at yamlPath via getHealthStatus (forwarding overwrites to it; presumably
// string substitutions applied to the fixture, confirm against getHealthStatus)
// and asserts that health, readiness, status code and message all match the
// expected values.
func assertAppHealthWithOverwriteMsg(t *testing.T, yamlPath string, overwrites map[string]string, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool, expectedMsg string) {
	// Attribute assertion failures to the calling test line, not this helper.
	t.Helper()

	// Named "got" rather than "health" so the imported health package is not shadowed.
	got := getHealthStatus(yamlPath, t, overwrites)

	// assert.NotNil only records a failure and keeps going; bail out here so
	// the field accesses below cannot panic on a nil result.
	if !assert.NotNil(t, got) {
		return
	}

	assert.Equal(t, expectedHealth, got.Health)
	assert.Equal(t, expectedReady, got.Ready)
	assert.Equal(t, expectedStatus, got.Status)
	assert.Equal(t, expectedMsg, got.Message)
}

func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]string, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) {
health := getHealthStatus(yamlPath, t, overwrites)
assert.NotNil(t, health)
Expand Down Expand Up @@ -143,6 +152,19 @@ func TestHPA(t *testing.T) {
}

func TestPod(t *testing.T) {
assertAppHealthWithOverwriteMsg(t, "./testdata/pod-not-ready-container-not-ready.yaml", map[string]string{
"2024-07-29T06:32:56Z": time.Now().Add(time.Minute * 10).Format(time.RFC3339),
}, health.HealthStatusStarting, health.HealthUnknown, false, "Container nginx is waiting for readiness probe")

// Pod not ready
assertAppHealth(t, "./testdata/pod-not-ready-but-container-ready.yaml", health.HealthStatusRunning, health.HealthWarning, false)

// Restart Loop
assertAppHealth(t, "./testdata/pod-ready-container-terminated.yaml", health.HealthStatusRunning, health.HealthWarning, true)
assertAppHealthWithOverwrite(t, "./testdata/pod-ready-container-terminated.yaml", map[string]string{
"2024-07-18T12:03:16Z": "2024-07-18T12:05:16Z",
}, health.HealthStatusRunning, health.HealthWarning, true)

// Less than 30 minutes
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
Expand Down
86 changes: 86 additions & 0 deletions pkg/health/testdata/pod-not-ready-but-container-ready.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
apiVersion: v1
kind: Pod
metadata:
uid: d21e1521-5a3f-4120-a446-bd7426199a20
name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
labels:
app: postgresql
controller-revision-hash: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-988f9fc65
statefulset.kubernetes.io/pod-name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-0
namespace: httpbin
generateName: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc-
ownerReferences:
- uid: da460101-eebb-4d4f-b8f4-acb8908d7083
kind: StatefulSet
name: postgresql-01902bbe-eb40-47d4-a0f7-0afb993645dc
apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-06-18T14:48:55Z
spec:
restartPolicy: Always
readiness:
initialDelaySeconds: 10
containers:
- name: busybox
image: busybox
command: ['sh', '-c', 'echo Hello Kubernetes! && sleep 3600']
restartPolicy: Always
resources:
requests:
memory: "64Mi"
cpu: "250m"
limits:
memory: "128Mi"
cpu: "500m"

nodeName: some-node
status:
phase: Running
podIP: 10.0.179.6
hostIP: 10.0.179.6
podIPs:
- ip: 10.0.179.6
hostIPs:
- ip: 10.0.179.6
qosClass: BestEffort
startTime: 2024-07-17T06:58:33Z
conditions:
- type: PodReadyToStartContainers
status: 'True'
- type: Initialized
status: 'True'
- type: Ready
status: 'False'
- type: ContainersReady
status: 'True'
- type: PodScheduled
status: 'True'
containerStatuses:
- name: eks-pod-identity-agent
image: 602401143452.dkr.ecr.eu-west-1.amazonaws.com/eks/eks-pod-identity-agent:0.1.10
ready: true
state:
running:
startedAt: 2024-07-17T06:58:43Z
imageID: 602401143452.dkr.ecr.eu-west-1.amazonaws.com/eks/eks-pod-identity-agent@sha256:9cf48154b1603963d449f80ed8bc150918509ac025e92cdd67748a9f3f8ad367
started: true
lastState: {}
containerID: containerd://8e48835f5f3765827aed1470ef5a95312345c52a1a5c3db24ff800c8e10a6424
restartCount: 0
initContainerStatuses:
- name: eks-pod-identity-agent-init
image: 602401143452.dkr.ecr.eu-west-1.amazonaws.com/eks/eks-pod-identity-agent:0.1.10
ready: true
state:
terminated:
reason: Completed
exitCode: 0
startedAt: 2024-07-17T06:58:37Z
finishedAt: 2024-07-17T06:58:37Z
containerID: containerd://16032f42f71448ef31794b6805cb69f25cc4e59fd83622d3fa9b3acd00a3867c
imageID: 602401143452.dkr.ecr.eu-west-1.amazonaws.com/eks/eks-pod-identity-agent@sha256:9cf48154b1603963d449f80ed8bc150918509ac025e92cdd67748a9f3f8ad367
started: false
lastState: {}
containerID: containerd://16032f42f71448ef31794b6805cb69f25cc4e59fd83622d3fa9b3acd00a3867c
restartCount: 0
123 changes: 123 additions & 0 deletions pkg/health/testdata/pod-not-ready-container-not-ready.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
apiVersion: v1
kind: Pod
metadata:
creationTimestamp: '2024-07-29T06:32:55Z'
generateName: nginx-deployment-559c4fc45b-
labels:
app: nginx
pod-template-hash: 559c4fc45b
name: nginx-deployment-559c4fc45b-xdlmh
namespace: default
ownerReferences:
- apiVersion: apps/v1
blockOwnerDeletion: true
controller: true
kind: ReplicaSet
name: nginx-deployment-559c4fc45b
uid: 28b8d6dd-1ffe-4ede-8e24-0bafa3bb71e3
resourceVersion: '68571321'
uid: 9d304944-d50a-4b8a-971a-7fcad1a4ca9f
spec:
containers:
- image: nginx:alpine
imagePullPolicy: IfNotPresent
name: nginx
ports:
- containerPort: 80
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /
port: 80
scheme: HTTP
initialDelaySeconds: 300
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
cpu: 100m
memory: 128Mi
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount
name: kube-api-access-ssdbh
readOnly: true
dnsPolicy: ClusterFirst
enableServiceLinks: true
nodeName: saka
preemptionPolicy: PreemptLowerPriority
priority: 0
restartPolicy: Always
schedulerName: default-scheduler
securityContext: {}
serviceAccount: default
serviceAccountName: default
terminationGracePeriodSeconds: 30
tolerations:
- effect: NoExecute
key: node.kubernetes.io/not-ready
operator: Exists
tolerationSeconds: 300
- effect: NoExecute
key: node.kubernetes.io/unreachable
operator: Exists
tolerationSeconds: 300
volumes:
- name: kube-api-access-ssdbh
projected:
defaultMode: 420
sources:
- serviceAccountToken:
expirationSeconds: 3607
path: token
- configMap:
items:
- key: ca.crt
path: ca.crt
name: kube-root-ca.crt
- downwardAPI:
items:
- fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
path: namespace
status:
conditions:
- lastTransitionTime: '2024-07-29T06:32:55Z'
status: 'True'
type: Initialized
- lastTransitionTime: '2024-07-29T06:32:55Z'
message: 'containers with unready status: [nginx]'
reason: ContainersNotReady
status: 'False'
type: Ready
- lastTransitionTime: '2024-07-29T06:32:55Z'
message: 'containers with unready status: [nginx]'
reason: ContainersNotReady
status: 'False'
type: ContainersReady
- lastTransitionTime: '2024-07-29T06:32:55Z'
status: 'True'
type: PodScheduled
containerStatuses:
- containerID: containerd://01d5d0ded389b8dffa95f85fc8cb9a8a1bf916beae74a75dd7a62bc10dabef01
image: docker.io/library/nginx:alpine
imageID: docker.io/library/nginx@sha256:208b70eefac13ee9be00e486f79c695b15cef861c680527171a27d253d834be9
lastState: {}
name: nginx
ready: false
restartCount: 0
started: true
state:
running:
startedAt: '2024-07-29T06:32:56Z'
hostIP: 10.99.99.8
phase: Running
podIP: 10.42.2.27
podIPs:
- ip: 10.42.2.27
qosClass: BestEffort
startTime: '2024-07-29T06:32:55Z'
Loading

0 comments on commit 1916ba9

Please sign in to comment.