Skip to content

Commit

Permalink
feat: detect frequently restarting pods (#71)
Browse files Browse the repository at this point in the history
* feat: detect frequently restarting pods

* chore: add test case for high restarts but a long time ago

* fix: time limit of restarts
  • Loading branch information
adityathebe committed Jul 18, 2024
1 parent 921c352 commit 9680abe
Show file tree
Hide file tree
Showing 4 changed files with 356 additions and 12 deletions.
15 changes: 15 additions & 0 deletions pkg/health/health_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,21 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) {
health = msg.Health
status = msg.Status
messages = append(messages, msg.Message)
} else if containerStatus.RestartCount > 2 && containerStatus.LastTerminationState.Terminated != nil {
lastRestarted := containerStatus.LastTerminationState.Terminated.FinishedAt.Time
if time.Since(lastRestarted) < time.Minute*30 {
return &HealthStatus{
Health: HealthUnhealthy,
Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason),
Message: strings.Join(messages, ", "),
}, nil
} else if time.Since(lastRestarted) < time.Hour*8 {
return &HealthStatus{
Health: HealthWarning,
Status: HealthStatusCode(containerStatus.LastTerminationState.Terminated.Reason),
Message: strings.Join(messages, ", "),
}, nil
}
}
}

Expand Down
40 changes: 28 additions & 12 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@ package health_test

import (
"os"
"strings"
"testing"
"time"

"github.com/flanksource/is-healthy/pkg/health"
"github.com/flanksource/is-healthy/pkg/lua"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"sigs.k8s.io/yaml"
)
Expand All @@ -26,28 +26,27 @@ func assertAppHealth(t *testing.T, yamlPath string, expectedStatus health.Health
assert.Equal(t, expectedStatus, health.Status)
}

// assertAppHealthWithOverwrite evaluates the manifest at yamlPath with
// overwrites applied (through getHealthStatus) and asserts that the returned
// status is non-nil and that its Health, Ready and Status fields equal
// expectedHealth, expectedReady and expectedStatus.
//
// NOTE(review): this is a diff rendering without +/- markers, so two
// signature lines appear back to back. The map[string]string form matches the
// TestPod cases that pass timestamp strings and looks like the post-change
// line; the map[string]any form looks like the line it replaced — confirm
// against the repository.
func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]any, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) {
func assertAppHealthWithOverwrite(t *testing.T, yamlPath string, overwrites map[string]string, expectedStatus health.HealthStatusCode, expectedHealth health.Health, expectedReady bool) {
// From here on the local variable health shadows the imported health package.
health := getHealthStatus(yamlPath, t, overwrites)
// NOTE(review): assert (unlike require) does not stop the test on failure, so
// a nil status would still be dereferenced by the field checks below.
assert.NotNil(t, health)
assert.Equal(t, expectedHealth, health.Health)
assert.Equal(t, expectedReady, health.Ready)
assert.Equal(t, expectedStatus, health.Status)
}

// getHealthStatus reads the manifest at yamlPath, applies overwrites, and
// returns the result of health.GetResourceHealth called with empty Lua
// overrides. A failure to read, unmarshal or evaluate the manifest aborts the
// test through require.NoError.
//
// NOTE(review): this span is a diff rendering without +/- markers, so
// pre-change and post-change lines are interleaved. The map[string]string
// signature, the strings.ReplaceAll line and the unmarshal that follows the
// loop appear to be the post-change version; the map[string]any signature,
// the unmarshal before the loop and the "deletionTimestamp" switch appear to
// be the lines they replaced — confirm against the repository. The closing
// brace of the function lies just outside the visible hunk.
func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]any) *health.HealthStatus {
func getHealthStatus(yamlPath string, t *testing.T, overwrites map[string]string) *health.HealthStatus {
yamlBytes, err := os.ReadFile(yamlPath)
require.NoError(t, err)
var obj unstructured.Unstructured
err = yaml.Unmarshal(yamlBytes, &obj)
require.NoError(t, err)

// Basic search-and-replace overwrite: every occurrence of key k in the raw
// YAML text is replaced with v.
// NOTE(review): map iteration order is unspecified, so with several
// overwrites the result is only deterministic when no replacement value
// contains another key.
for k, v := range overwrites {
switch k {
case "deletionTimestamp":
obj.SetDeletionTimestamp(v.(*v1.Time))
}
// The replacement is purely textual, so k must be specific enough (for
// example a full timestamp) not to match unrelated fields.
yamlBytes = []byte(strings.ReplaceAll(string(yamlBytes), k, v))
}

// Parse only after the text has been rewritten so the object reflects the
// overwritten values.
var obj unstructured.Unstructured
err = yaml.Unmarshal(yamlBytes, &obj)
require.NoError(t, err)

// The local health declared here shadows the imported health package.
health, err := health.GetResourceHealth(&obj, lua.ResourceHealthOverrides{})
require.NoError(t, err)
return health
Expand Down Expand Up @@ -144,6 +143,23 @@ func TestHPA(t *testing.T) {
}

// TestPod checks the health computed for the pod fixtures under ./testdata.
//
// NOTE(review): this span is a diff rendering; a collapsed-hunk marker line
// sits inside it, so part of the function body is not visible here, and the
// final pod-deletion case shows both its pre-change and post-change lines.
func TestPod(t *testing.T) {
// Last termination one minute ago (less than 30 minutes): expect Unhealthy
// with the termination reason from the fixture as the status.
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, "OOMKilled", health.HealthUnhealthy, false)

// Last termination one hour ago (between 30 minutes and 8 hours): expect
// Warning with the same status.
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": time.Now().UTC().Add(-time.Hour).Format("2006-01-02T15:04:05Z"),
}, "OOMKilled", health.HealthWarning, false)

// Last termination pinned to a fixed date in June 2024 (more than 8 hours
// ago): the restart history is expected to be ignored.
assertAppHealthWithOverwrite(t, "./testdata/pod-high-restart-count.yaml", map[string]string{
"2024-07-17T14:29:51Z": "2024-06-17T14:29:51Z",
}, health.HealthStatusRunning, health.HealthHealthy, true)

assertAppHealth(t, "./testdata/pod-old-restarts.yaml", health.HealthStatusRunning, health.HealthHealthy, true)

assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false)
status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil)
assert.Contains(t, status.Message, "stuck in 'Terminating' for")
Expand All @@ -160,8 +176,8 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true)
assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)

// NOTE(review): unlike the restart cases above, the timestamp below is
// formatted from local time (no .UTC()) while the layout ends in a literal
// "Z". On a machine whose zone is not UTC the rendered value is shifted by
// the zone offset instead of being one minute in the past — consider
// time.Now().UTC() here as well.
assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]any{
"deletionTimestamp": &v1.Time{Time: time.Now().Add(-time.Minute)},
assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)
}

Expand Down
170 changes: 170 additions & 0 deletions pkg/health/testdata/pod-high-restart-count.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
apiVersion: v1
kind: Pod
metadata:
uid: ba54f218-3435-464c-9f37-75ac7d76104a
name: config-db-5867b6596f-gs79g
labels:
control-plane: config-db
pod-template-hash: 5867b6596f
app.kubernetes.io/name: config-db
app.kubernetes.io/instance: mission-control
namespace: mission-control
generateName: config-db-5867b6596f-
ownerReferences:
- uid: 00427427-8dee-4003-bd4e-496b0cc275d1
kind: ReplicaSet
name: config-db-5867b6596f
apiVersion: apps/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-07-16T13:31:23Z
spec:
volumes:
- name: aws-iam-token
projected:
sources:
- serviceAccountToken:
path: token
audience: sts.amazonaws.com
expirationSeconds: 86400
defaultMode: 420
- name: kube-api-access-7jmn6
projected:
sources:
- serviceAccountToken:
path: token
expirationSeconds: 3607
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
fieldPath: metadata.namespace
apiVersion: v1
defaultMode: 420
nodeName: ip-10-0-6-40.eu-west-1.compute.internal
priority: 0
dnsPolicy: ClusterFirst
containers:
- env:
- name: DB_URL
valueFrom:
secretKeyRef:
key: DB_URL
name: incident-commander-postgres
- name: NAMESPACE
value: mission-control
- name: AWS_STS_REGIONAL_ENDPOINTS
value: regional
- name: AWS_DEFAULT_REGION
value: eu-west-1
- name: AWS_REGION
value: eu-west-1
- name: AWS_ROLE_ARN
value: arn:aws:iam::765618022540:role/eksctl-config-db-sa
- name: AWS_WEB_IDENTITY_TOKEN_FILE
value: /var/run/secrets/eks.amazonaws.com/serviceaccount/token
args:
- operator
- --disable-postgrest=true
- --change-retention-days=60
- --analysis-retention-days=60
- --json-logs
- --otel-collector-url=grafana-alloy.monitoring:4317
- --otel-service-name=config-db
name: config-db
image: public.ecr.aws/k4y9r6y5/config-db:v0.0.400
command:
- /app/config-db
resources:
limits:
cpu: 500m
memory: 4Gi
requests:
cpu: 200m
memory: 1Gi
volumeMounts:
- name: kube-api-access-7jmn6
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
- name: aws-iam-token
readOnly: true
mountPath: /var/run/secrets/eks.amazonaws.com/serviceaccount
livenessProbe:
httpGet:
path: /live
port: 8080
scheme: HTTP
periodSeconds: 10
timeoutSeconds: 1
failureThreshold: 3
successThreshold: 1
readinessProbe:
httpGet:
path: /ready
port: 8080
scheme: HTTP
periodSeconds: 10
timeoutSeconds: 1
failureThreshold: 3
successThreshold: 1
imagePullPolicy: IfNotPresent
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
tolerations:
- key: node.kubernetes.io/not-ready
effect: NoExecute
operator: Exists
tolerationSeconds: 300
- key: node.kubernetes.io/unreachable
effect: NoExecute
operator: Exists
tolerationSeconds: 300
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: config-db-sa
securityContext:
fsGroup: 1000
preemptionPolicy: PreemptLowerPriority
enableServiceLinks: true
serviceAccountName: config-db-sa
terminationGracePeriodSeconds: 30
# Observed state of the fixture pod: running and ready, but with a large
# restart count and a recorded previous termination.
# NOTE(review): nesting indentation appears to have been lost in this
# rendering; the key order below is unchanged.
status:
phase: Running
podIP: 10.0.6.38
hostIP: 10.0.6.40
podIPs:
- ip: 10.0.6.38
qosClass: Burstable
startTime: 2024-07-16T13:31:23Z
conditions:
- type: Initialized
status: "True"
- type: Ready
status: "True"
- type: ContainersReady
status: "True"
- type: PodScheduled
status: "True"
containerStatuses:
- name: config-db
image: public.ecr.aws/k4y9r6y5/config-db:v0.0.400
ready: true
state:
running:
startedAt: 2024-07-17T14:29:52Z
imageID: public.ecr.aws/k4y9r6y5/config-db@sha256:b8803113097931662bda448b53c6ca256957957d74d5e8fd1fc442cec197b025
started: true
lastState:
terminated:
# TestPod expects this reason as the health status in its "less than 30
# minutes" and "less than 8 hours" cases.
reason: OOMKilled
exitCode: 137
startedAt: 2024-07-17T14:13:28Z
# TestPod substitutes this exact timestamp string to place the last
# termination at a chosen distance from the current time.
finishedAt: 2024-07-17T14:29:51Z
containerID: containerd://ee5467962528e4a836dfb48cf9f23d7d547eb7e17cb0e96b9ebf698b05d04420
containerID: containerd://68dab40e5ad9d6a66477dd2d388e2d7bf37607743b48c0aa454a623d4fa7f7a7
# Above the RestartCount > 2 threshold checked in health_pod.go.
restartCount: 101
Loading

0 comments on commit 9680abe

Please sign in to comment.