Skip to content

Commit

Permalink
feat: introduce health checks for Numaflow CRDs
Browse files Browse the repository at this point in the history
Signed-off-by: Dillen Padhiar <dillen_padhiar@intuit.com>
  • Loading branch information
dpadhiar committed Oct 8, 2024
1 parent dc27102 commit b59925a
Show file tree
Hide file tree
Showing 22 changed files with 1,352 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
-- Health assessment for a Numaflow InterStepBufferService resource.
--
-- Reads the `obj` variable (not defined in this script; it is expected to be
-- supplied by the caller) and returns a table with `status` and `message`:
--   * "Degraded"    - observedGeneration matches metadata.generation and
--                     either the ChildrenResourcesHealthy condition has
--                     status "False" or status.phase is "Failed".
--   * "Healthy"     - observedGeneration matches, ChildrenResourcesHealthy
--                     has status "True" and status.phase is "Running".
--   * "Progressing" - every other case (no status, generation mismatch, or
--                     no decisive condition/phase).
local hs = {}

-- nil until a ChildrenResourcesHealthy condition is found. A nil sentinel is
-- used on purpose: Lua compares tables by reference, so a check such as
-- `cond ~= {}` is always true and cannot detect "not found".
local healthyCondition = nil

if obj.status ~= nil then
  if obj.status.conditions ~= nil then
    -- No early exit: if the type appears more than once, the last entry wins.
    for _, condition in ipairs(obj.status.conditions) do
      if condition.type == "ChildrenResourcesHealthy" then
        healthyCondition = condition
      end
    end
  end

  -- Conditions and phase are only evaluated when observedGeneration matches
  -- metadata.generation; otherwise control falls through to "Progressing".
  if obj.metadata.generation == obj.status.observedGeneration then
    local childrenUnhealthy = healthyCondition ~= nil and healthyCondition.status == "False"
    local childrenHealthy = healthyCondition ~= nil and healthyCondition.status == "True"

    if childrenUnhealthy or obj.status.phase == "Failed" then
      hs.status = "Degraded"
      if obj.status.phase == "Failed" then
        -- A failed phase reports the resource-level message.
        hs.message = obj.status.message
      else
        -- Reaching this branch implies childrenUnhealthy, so the condition exists.
        hs.message = healthyCondition.message
      end
      return hs
    elseif childrenHealthy and obj.status.phase == "Running" then
      hs.status = "Healthy"
      hs.message = healthyCondition.message
      return hs
    end
  end
end

hs.status = "Progressing"
hs.message = "Waiting for InterStepBufferService status"
return hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Health-check test cases. Each entry names an input manifest (inputPath) and
# the health status and message expected for that manifest.
tests:
# Expected message is the literal "Waiting for InterStepBufferService status"
# that the health script assigns on its final Progressing path.
- healthStatus:
status: Progressing
message: "Waiting for InterStepBufferService status"
inputPath: testdata/progressing.yaml
# For Healthy and Degraded the script copies the message of the
# ChildrenResourcesHealthy condition. The trailing "\n" presumably comes from
# that message being written as a block scalar (`message: |`) in the
# fixture - verify against the testdata files.
- healthStatus:
status: Healthy
message: "partitioned roll out complete: 3 new pods have been updated...\n"
inputPath: testdata/healthy.yaml
- healthStatus:
status: Degraded
message: "Waiting for 3 pods to be ready...\n"
inputPath: testdata/degraded.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
creationTimestamp: "2024-10-08T16:32:12Z"
finalizers:
- isbsvc-controller
generation: 1
name: test-isbservice-rollout
namespace: numaplane-system
ownerReferences:
- apiVersion: numaplane.numaproj.io/v1alpha1
blockOwnerDeletion: true
controller: true
kind: ISBServiceRollout
name: test-isbservice-rollout
uid: 28fc22f1-83f3-441c-bf76-dd4a11ce3891
resourceVersion: "348334"
uid: 06a5cb44-f3f9-42f8-ab9e-7d2f10323f9e
spec:
jetstream:
containerTemplate:
resources:
limits:
memory: 10Mi
persistence:
volumeSize: 10Mi
version: 2.9.6
status:
conditions:
- lastTransitionTime: "2024-10-08T16:32:37Z"
message: |
Waiting for 3 pods to be ready...
reason: Unavailable
status: "False"
type: ChildrenResourcesHealthy
- lastTransitionTime: "2024-10-08T16:32:37Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T16:32:37Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
config:
jetstream:
auth:
basic:
password:
key: client-auth-password
name: isbsvc-test-isbservice-rollout-js-client-auth
user:
key: client-auth-user
name: isbsvc-test-isbservice-rollout-js-client-auth
streamConfig: |
consumer:
ackwait: 60s
maxackpending: 25000
otbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 3h
procbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 72h
stream:
duplicates: 60s
maxage: 72h
maxbytes: -1
maxmsgs: 100000
replicas: 3
retention: 0
storage: 0
url: nats://isbsvc-test-isbservice-rollout-js-svc.numaplane-system.svc:4222
message: |
Unavailable: Waiting for 3 pods to be ready...
observedGeneration: 1
phase: Running
type: jetstream
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"InterStepBufferService","metadata":{"annotations":{},"name":"default","namespace":"numaflow-system"},"spec":{"jetstream":{"persistence":{"volumeSize":"3Gi"},"version":"latest"}}}
creationTimestamp: "2024-10-08T18:21:09Z"
finalizers:
- isbsvc-controller
generation: 1
name: default
namespace: numaflow-system
resourceVersion: "357862"
uid: e175db66-3918-4ef8-993d-12b37eb9a964
spec:
jetstream:
persistence:
volumeSize: 3Gi
replicas: 3
version: latest
status:
conditions:
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: |
partitioned roll out complete: 3 new pods have been updated...
reason: Healthy
status: "True"
type: ChildrenResourcesHealthy
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
config:
jetstream:
auth:
basic:
password:
key: client-auth-password
name: isbsvc-default-js-client-auth
user:
key: client-auth-user
name: isbsvc-default-js-client-auth
streamConfig: |
consumer:
ackwait: 60s
maxackpending: 25000
otbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 3h
procbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 72h
stream:
duplicates: 60s
maxage: 72h
maxbytes: -1
maxmsgs: 100000
replicas: 3
retention: 0
storage: 0
url: nats://isbsvc-default-js-svc.numaflow-system.svc:4222
observedGeneration: 1
phase: Running
type: jetstream
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: InterStepBufferService
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"numaflow.numaproj.io/v1alpha1","kind":"InterStepBufferService","metadata":{"annotations":{},"name":"default","namespace":"numaflow-system"},"spec":{"jetstream":{"persistence":{"volumeSize":"3Gi"},"version":"latest"}}}
creationTimestamp: "2024-10-08T18:21:09Z"
finalizers:
- isbsvc-controller
generation: 2
name: default
namespace: numaflow-system
resourceVersion: "357862"
uid: e175db66-3918-4ef8-993d-12b37eb9a964
spec:
jetstream:
persistence:
volumeSize: 3Gi
replicas: 3
version: latest
status:
conditions:
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: |
partitioned roll out complete: 3 new pods have been updated...
reason: Healthy
status: "True"
type: ChildrenResourcesHealthy
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Configured
- lastTransitionTime: "2024-10-08T18:21:53Z"
message: Successful
reason: Successful
status: "True"
type: Deployed
config:
jetstream:
auth:
basic:
password:
key: client-auth-password
name: isbsvc-default-js-client-auth
user:
key: client-auth-user
name: isbsvc-default-js-client-auth
streamConfig: |
consumer:
ackwait: 60s
maxackpending: 25000
otbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 3h
procbucket:
history: 1
maxbytes: 0
maxvaluesize: 0
replicas: 3
storage: 0
ttl: 72h
stream:
duplicates: 60s
maxage: 72h
maxbytes: -1
maxmsgs: 100000
replicas: 3
retention: 0
storage: 0
url: nats://isbsvc-default-js-svc.numaflow-system.svc:4222
observedGeneration: 1
phase: Running
type: jetstream
40 changes: 40 additions & 0 deletions resource_customizations/numaflow.numaproj.io/MonoVertex/health.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
-- Health assessment for a Numaflow MonoVertex resource.
--
-- Reads the `obj` variable (not defined in this script; it is expected to be
-- supplied by the caller) and returns a table with `status` and `message`:
--   * "Degraded"    - observedGeneration matches metadata.generation and the
--                     PodsHealthy or DaemonHealthy condition has status
--                     "False", or status.phase is "Failed".
--   * "Suspended"   - observedGeneration matches, not degraded, and
--                     status.phase is "Paused".
--   * "Healthy"     - observedGeneration matches, both conditions have status
--                     "True" and status.phase is "Running".
--   * "Progressing" - every other case (no status, generation mismatch, or
--                     no decisive condition/phase).
local hs = {}

-- nil until the matching condition is found. A nil sentinel is used on
-- purpose: Lua compares tables by reference, so a check such as
-- `cond ~= {}` is always true and cannot detect "not found".
local podsHealth = nil
local daemonHealth = nil

if obj.status ~= nil then
  if obj.status.conditions ~= nil then
    -- No early exit: if a type appears more than once, the last entry wins.
    for _, condition in ipairs(obj.status.conditions) do
      if condition.type == "PodsHealthy" then
        podsHealth = condition
      end
      if condition.type == "DaemonHealthy" then
        daemonHealth = condition
      end
    end
  end

  -- Conditions and phase are only evaluated when observedGeneration matches
  -- metadata.generation; otherwise control falls through to "Progressing".
  if obj.metadata.generation == obj.status.observedGeneration then
    local podsFailing = podsHealth ~= nil and podsHealth.status == "False"
    local daemonFailing = daemonHealth ~= nil and daemonHealth.status == "False"
    local podsOk = podsHealth ~= nil and podsHealth.status == "True"
    local daemonOk = daemonHealth ~= nil and daemonHealth.status == "True"

    -- Degraded is checked first, so a failing condition takes precedence
    -- over a "Paused" phase.
    if podsFailing or daemonFailing or obj.status.phase == "Failed" then
      hs.status = "Degraded"
      if obj.status.phase == "Failed" then
        -- A failed phase reports the resource-level message.
        hs.message = obj.status.message
      else
        hs.message = "Subresources are unhealthy"
      end
      return hs
    elseif obj.status.phase == "Paused" then
      hs.status = "Suspended"
      hs.message = "MonoVertex is paused"
      return hs
    elseif podsOk and daemonOk and obj.status.phase == "Running" then
      hs.status = "Healthy"
      hs.message = "MonoVertex is healthy"
      return hs
    end
  end
end

hs.status = "Progressing"
hs.message = "Waiting for MonoVertex status"
return hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Health-check test cases for the MonoVertex health script. Each entry names
# an input manifest (inputPath) and the health status and message expected
# for that manifest. Every expected message below is a fixed string literal
# in the script, not text copied from the resource.
tests:
# Final fallback path of the script.
- healthStatus:
status: Progressing
message: "Waiting for MonoVertex status"
inputPath: testdata/progressing.yaml
# PodsHealthy and DaemonHealthy both "True" with phase "Running".
- healthStatus:
status: Healthy
message: "MonoVertex is healthy"
inputPath: testdata/healthy.yaml
# This message is produced when a condition is "False" and the phase is not
# "Failed" (a "Failed" phase would report status.message instead).
- healthStatus:
status: Degraded
message: "Subresources are unhealthy"
inputPath: testdata/degraded.yaml
# Phase "Paused" with no failing condition.
- healthStatus:
status: Suspended
message: "MonoVertex is paused"
inputPath: testdata/suspended.yaml
Loading

0 comments on commit b59925a

Please sign in to comment.