Skip to content

Commit

Permalink
feat: terminating stalled
Browse files Browse the repository at this point in the history
  • Loading branch information
adityathebe authored and moshloop committed Aug 19, 2024
1 parent 3be957a commit 4fd3150
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 8 deletions.
11 changes: 11 additions & 0 deletions pkg/health/health.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package health

import (
"fmt"
"strings"
"time"

"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down Expand Up @@ -108,6 +110,15 @@ func GetHealthByConfigType(configType string, obj map[string]any) HealthStatus {

// GetResourceHealth returns the health of a k8s resource
func GetResourceHealth(obj *unstructured.Unstructured, healthOverride HealthOverride) (health *HealthStatus, err error) {
if obj.GetDeletionTimestamp() != nil && !obj.GetDeletionTimestamp().IsZero() && time.Since(obj.GetDeletionTimestamp().Time) > time.Hour {
terminatingFor := time.Since(obj.GetDeletionTimestamp().Time)
return &HealthStatus{
Status: "TerminatingStalled",
Health: HealthUnhealthy,
Message: fmt.Sprintf("Resource is terminating, time since deletion: %v", terminatingFor),
}, nil
}

if healthCheck := GetHealthCheckFunc(obj.GroupVersionKind()); healthCheck != nil {
if health, err = healthCheck(obj); err != nil {
health = &HealthStatus{
Expand Down
19 changes: 11 additions & 8 deletions pkg/health/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,17 @@ func TestKustomization(t *testing.T) {
}

func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/terminating-stuck.yaml", "TerminatingStalled", health.HealthUnhealthy, false)
assertAppHealth(t, "./testdata/terminating-namespace.yaml", "TerminatingStalled", health.HealthUnhealthy, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-terminating.yaml", map[string]string{
"2024-07-01T06:52:22Z": time.Now().Add(-time.Minute * 20).UTC().Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthWarning, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)

assertAppHealthWithOverwriteMsg(t, "./testdata/pod-not-ready-container-not-ready.yaml", map[string]string{
"2024-07-29T06:32:56Z": time.Now().Add(time.Minute * 10).Format(time.RFC3339),
}, health.HealthStatusStarting, health.HealthUnknown, false, "Container nginx is waiting for readiness probe")
Expand Down Expand Up @@ -201,10 +212,6 @@ func TestPod(t *testing.T) {

assertAppHealth(t, "./testdata/pod-old-restarts.yaml", health.HealthStatusRunning, health.HealthHealthy, true)

assertAppHealth(t, "./testdata/pod-terminating.yaml", health.HealthStatusTerminating, health.HealthWarning, false)
status := getHealthStatus("./testdata/pod-terminating.yaml", t, nil)
assert.Contains(t, status.Message, "stuck in 'Terminating' for")

assertAppHealth(t, "./testdata/pod-pending.yaml", health.HealthStatusPending, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-running-not-ready.yaml", health.HealthStatusStarting, health.HealthUnknown, false)
assertAppHealth(t, "./testdata/pod-crashloop.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)
Expand All @@ -216,10 +223,6 @@ func TestPod(t *testing.T) {
assertAppHealth(t, "./testdata/pod-failed.yaml", health.HealthStatusError, health.HealthUnhealthy, true)
assertAppHealth(t, "./testdata/pod-succeeded.yaml", health.HealthStatusCompleted, health.HealthHealthy, true)
assertAppHealth(t, "./testdata/pod-init-container-fail.yaml", health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, false)

assertAppHealthWithOverwrite(t, "./testdata/pod-deletion.yaml", map[string]string{
"2018-12-03T10:16:04Z": time.Now().Add(-time.Minute).Format("2006-01-02T15:04:05Z"),
}, health.HealthStatusTerminating, health.HealthUnknown, false)
}

// func TestAPIService(t *testing.T) {
Expand Down
6 changes: 6 additions & 0 deletions pkg/health/testdata/terminating-namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Test fixture: a Namespace that carries a deletionTimestamp.
# TestPod loads this file without rewriting any timestamps and expects the
# status "TerminatingStalled" with unhealthy health.
apiVersion: v1
kind: Namespace
metadata:
name: stuck-namespace
creationTimestamp: 2024-08-09T02:00:00Z
# Fixed date in the past: GetResourceHealth reports TerminatingStalled once more
# than one hour has elapsed since this timestamp.
deletionTimestamp: 2024-08-09T03:00:05Z
128 changes: 128 additions & 0 deletions pkg/health/testdata/terminating-stuck.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Test fixture: a Pod that carries a deletionTimestamp while its phase is Succeeded.
# TestPod loads this file without rewriting any timestamps and expects the
# status "TerminatingStalled" with unhealthy health.
apiVersion: v1
kind: Pod
metadata:
uid: fce251e7-c46f-4e3b-a9b6-05ad67de270c
name: alert-manager-qzc65
labels: {}
namespace: management
generateName: alert-manager-
ownerReferences:
- uid: 9939fcbb-8ffd-4c51-b01b-bff8092db71d
kind: Job
name: alert-manager
apiVersion: batch/v1
controller: true
blockOwnerDeletion: true
creationTimestamp: 2024-08-09T02:00:00Z
# Fixed date in the past: GetResourceHealth reports TerminatingStalled once more
# than one hour has elapsed since this timestamp.
deletionTimestamp: 2024-08-09T03:00:05Z
deletionGracePeriodSeconds: 0
spec:
volumes:
- name: alertmanager-storage
persistentVolumeClaim:
claimName: alertmanager-storage-alertmanager-0
- name: config
configMap:
name: alertmanager-configmap
defaultMode: 420
- name: kube-api-access-2fklr
projected:
sources:
- serviceAccountToken:
path: token
expirationSeconds: 3607
- configMap:
name: kube-root-ca.crt
items:
- key: ca.crt
path: ca.crt
- downwardAPI:
items:
- path: namespace
fieldRef:
fieldPath: metadata.namespace
apiVersion: v1
defaultMode: 420
hostname: alertmanager-0
nodeName: esr
priority: 0
dnsPolicy: ClusterFirst
subdomain: alertmanager
containers:
- name: alertmanager
image: quay.io/prometheus/alertmanager:v0.27.0
ports:
- name: http
protocol: TCP
containerPort: 9093
resources:
limits:
memory: 99M
requests:
cpu: 11m
memory: 50M
volumeMounts:
- name: alertmanager-storage
mountPath: /alertmanager
- name: config
mountPath: /etc/alertmanager
- name: kube-api-access-2fklr
readOnly: true
mountPath: /var/run/secrets/kubernetes.io/serviceaccount
imagePullPolicy: IfNotPresent
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
tolerations:
- key: node.kubernetes.io/not-ready
effect: NoExecute
operator: Exists
tolerationSeconds: 300
- key: node.kubernetes.io/unreachable
effect: NoExecute
operator: Exists
tolerationSeconds: 300
restartPolicy: Always
schedulerName: default-scheduler
serviceAccount: default
securityContext: {}
preemptionPolicy: PreemptLowerPriority
enableServiceLinks: false
serviceAccountName: default
automountServiceAccountToken: true
terminationGracePeriodSeconds: 30
status:
phase: Succeeded
podIP: 99.99.99.99
hostIP: 99.99.99.99
podIPs:
- ip: 99.99.99.99
qosClass: BestEffort
startTime: 2024-08-09T02:00:00Z
conditions:
- type: Initialized
reason: PodCompleted
status: "True"
- type: Ready
reason: PodCompleted
status: "False"
- type: ContainersReady
reason: PodCompleted
status: "False"
- type: PodScheduled
status: "True"
# NOTE(review): the container name and image below do not match spec.containers
# ("alertmanager") — presumably a result of sanitizing the fixture; confirm this is intended.
containerStatuses:
- name: aws-fargate-alert
image: flanksource.com/iiab-cronjobs:latest
ready: false
state:
terminated:
reason: Completed
exitCode: 0
startedAt: 2024-08-09T02:00:01Z
finishedAt: 2024-08-09T02:00:02Z
containerID: containerd://a878069fc3ae58b76423d3dffbb4fc57959bb2e68534ce1cb88e468478cfc55a
imageID: flanksource.com/iiab-cronjobs@sha256:9a560d72c176e0b77133f8df13acb3a3a761fbecbf5671c05a2d8b8b05450bc9
started: false
lastState: {}
containerID: containerd://a878069fc3ae58b76423d3dffbb4fc57959bb2e68534ce1cb88e468478cfc55a
restartCount: 0

0 comments on commit 4fd3150

Please sign in to comment.