Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implementing healthcheck sidecar and probe #375

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions .github/workflows/pr-healthcheck-sidecar-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# Builds and publishes the QA healthcheck-sidecar image when sidecar sources
# change on main (README-only changes are ignored).
name: Publish QA Healthcheck Sidecar Container Images

on:
  push:
    branches:
      - main
    paths:
      # `**` also matches files in nested directories; a single `*` stops at `/`.
      - "healthcheck-sidecar/**"
      - "!healthcheck-sidecar/README.md"

env:
  GHCR_REGISTRY: ghcr.io
  GHCR_HS_IMAGE_NAME: "${{ github.repository }}/healthcheck-sidecar"
  QUAY_REGISTRY: quay.io
  QUAY_HS_IMAGE_NAME: instructlab-ui/healthcheck-sidecar

jobs:
  # Builds the healthcheck-sidecar image, pushes it to GHCR and Quay tagged
  # `pr-<number>`, attests both pushes, then bumps the QA kustomization to
  # that tag and pushes the change back to main.
  build_and_publish_hs_qa_image:
    name: Push QA Healthcheck Sidecar container image to GHCR and QUAY
    runs-on: ubuntu-latest
    environment: registry-creds
    permissions:
      packages: write
      contents: write
      attestations: write
      id-token: write

    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
        with:
          token: ${{ secrets.BOT_PAT }}
          ref: 'main'

      - name: Log in to the GHCR container image registry
        uses: docker/login-action@v3
        with:
          registry: "${{ env.GHCR_REGISTRY }}"
          username: "${{ github.actor }}"
          password: "${{ secrets.GITHUB_TOKEN }}"

      - name: Log in to the Quay container image registry
        uses: docker/login-action@v3
        with:
          registry: "${{ env.QUAY_REGISTRY }}"
          username: "${{ secrets.QUAY_USERNAME }}"
          password: "${{ secrets.QUAY_TOKEN }}"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Cache Docker layers
        uses: actions/cache@v4
        with:
          path: /tmp/.buildx-cache
          key: "${{ runner.os }}-buildx-${{ github.sha }}"
          # Inside a block scalar quotes are literal characters, so the prefix
          # must be written bare or it can never match the key above.
          restore-keys: |
            ${{ runner.os }}-buildx-

      - name: Get Pull Request Number from Commit
        id: get_pr_number
        uses: actions/github-script@v7
        with:
          script: |
            console.log("Repository owner:", context.repo.owner);
            console.log("Repository name:", context.repo.repo);
            console.log("Current commit SHA:", context.sha);

            const prs = await github.rest.pulls.list({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'closed',
              sort: 'updated',
              direction: 'desc'
            });
            console.log("Number of closed PRs fetched:", prs.data.length);

            // Closed-but-unmerged PRs carry merged_at === null, so test for
            // truthiness; comparing against "" accepts every closed PR.
            const mergedPrs = prs.data.filter((pr) => {
              console.log("Checking PR #", pr.number, "- Merged:", Boolean(pr.merged_at));
              return Boolean(pr.merged_at);
            });

            // Prefer the PR whose merge produced the commit being built; fall
            // back to the most recently updated merged PR.
            const match =
              mergedPrs.find((pr) => pr.merge_commit_sha === context.sha) || mergedPrs[0];
            if (match) {
              console.log("Found merged PR:", match.number);
              return match.number;
            }

            console.log("No merged PR found in the recent closed PRs.");
            return '';

      - name: Extract GHCR metadata (tags, labels) for HS image
        id: ghcr_hs_meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME }}

      - name: Extract Quay metadata (tags, labels) for HS image
        id: quay_hs_meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME }}

      - name: Build and push HS image to GHCR
        id: push-hs-ghcr
        uses: docker/build-push-action@v6
        with:
          # The Containerfile COPYs sidecar-script.py and requirements.txt by
          # bare name, so the build context must be the sidecar directory.
          context: healthcheck-sidecar
          push: true
          # One tag per line, unquoted: quotes in a block scalar are literal
          # and would wrap a multi-line metadata output into a single value.
          tags: |-
            ${{ steps.ghcr_hs_meta.outputs.tags }}
            ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME }}:pr-${{ steps.get_pr_number.outputs.result }}
          labels: ${{ steps.ghcr_hs_meta.outputs.labels }}
          platforms: linux/amd64,linux/arm64
          cache-from: type=gha
          cache-to: type=gha,mode=max
          file: healthcheck-sidecar/Containerfile

      - name: Generate GHCR artifact attestation
        uses: actions/attest-build-provenance@v1
        with:
          subject-name: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME }}
          subject-digest: ${{ steps.push-hs-ghcr.outputs.digest }}
          push-to-registry: true

      - name: Build and push HS image to QUAY
        id: push-hs-quay
        uses: docker/build-push-action@v6
        with:
          # Same context and tag rules as the GHCR build above.
          context: healthcheck-sidecar
          push: true
          tags: |-
            ${{ steps.quay_hs_meta.outputs.tags }}
            ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME }}:pr-${{ steps.get_pr_number.outputs.result }}
          labels: ${{ steps.quay_hs_meta.outputs.labels }}
          platforms: linux/amd64,linux/arm64
          cache-from: type=gha
          cache-to: type=gha,mode=max
          file: healthcheck-sidecar/Containerfile

      - name: Generate QA HS Quay artifact attestation
        uses: actions/attest-build-provenance@v1
        with:
          subject-name: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME }}
          subject-digest: ${{ steps.push-hs-quay.outputs.digest }}
          push-to-registry: true

      - name: Update coderefs before code changes
        run: |-
          git pull --ff-only

      - name: Update QA Quay HS image
        id: update_qa_hs_manifest_image
        env:
          PR_TAG: "pr-${{ steps.get_pr_number.outputs.result }}"
        run: |-
          sudo wget https://github.com/mikefarah/yq/releases/download/v4.34.1/yq_linux_amd64 -O /usr/local/bin/yq
          sudo chmod +x /usr/local/bin/yq
          yq -i '
            (.images[] | select(.name == "quay.io/instructlab-ui/healthcheck-sidecar") | .newTag) = env(PR_TAG)
          ' deploy/k8s/overlays/openshift/qa/kustomization.yaml

      - name: Commit and push bump QA HS Image manifest
        env:
          PR_TAG: "pr-${{ steps.get_pr_number.outputs.result }}"
        run: |-
          git config user.name "platform-engineering-bot"
          git config user.email "platform-engineering@redhat.com"
          git add deploy/k8s/overlays/openshift/qa/kustomization.yaml
          # A re-run can leave the manifest unchanged; `git commit` would then
          # exit non-zero and fail the job for no reason.
          if git diff --cached --quiet; then
            echo "QA HS image tag already set to ${PR_TAG}; nothing to commit."
            exit 0
          fi
          git commit -m "[CI AUTOMATION]: Bumping QA HS image to tag: ${PR_TAG}" -s
          git push origin main

15 changes: 15 additions & 0 deletions deploy/k8s/base/ui/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,19 @@ spec:
envFrom:
- secretRef:
name: ui-config
# Readiness is decided by healthcheck-probe.sh (copied into /opt/app-root/src
# by src/Containerfile). First check after 5s, then every 10s.
readinessProbe:
exec:
command:
- sh
- -c
- "/opt/app-root/src/healthcheck-probe.sh"
initialDelaySeconds: 5
periodSeconds: 10
# Sidecar container that serves the model-endpoint health status on port 8080.
# The image tag is a placeholder; the overlay kustomizations set the real tag.
- name: model-endpoint-healthcheck-sidecar
image: quay.io/instructlab-ui/healthcheck-sidecar:PATCHED_FROM_OVERLAYS
ports:
- containerPort: 8080
# NOTE(review): this base manifest references a secret named `qa.env`;
# confirm non-QA overlays provide or patch it.
envFrom:
- secretRef:
name: qa.env
restartPolicy: Always
2 changes: 2 additions & 0 deletions deploy/k8s/overlays/openshift/prod/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,5 @@ images:
newTag: v1.0.0-beta.3
- name: quay.io/instructlab-ui/pathservice
newTag: v1.0.0-beta.3
# NOTE(review): floating `latest` tag, while the neighbouring prod images are
# pinned to v1.0.0-beta.3 - confirm this is intended for prod.
- name: quay.io/instructlab-ui/healthcheck-sidecar
newTag: latest
2 changes: 2 additions & 0 deletions deploy/k8s/overlays/openshift/qa/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,5 @@ images:
newTag: pr-377
- name: quay.io/instructlab-ui/pathservice
newTag: latest
# Initial value only: the pr-healthcheck-sidecar-image workflow rewrites this
# newTag to `pr-<number>` with yq after each image publish.
- name: quay.io/instructlab-ui/healthcheck-sidecar
newTag: latest
9 changes: 9 additions & 0 deletions healthcheck-sidecar/Containerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Image for the healthcheck sidecar: a Python script that serves the model
# endpoint health status over HTTP.
FROM registry.access.redhat.com/ubi9-minimal:9.5-1731593028

# Drop the package cache in the same layer so it does not bloat the image.
RUN microdnf install -y jq python3 python3-pip \
    && microdnf clean all

# With more than one source, the COPY destination must be a directory and
# end with a trailing slash.
COPY sidecar-script.py requirements.txt /home/

RUN python3 -m pip install --no-cache-dir -r /home/requirements.txt

ENTRYPOINT ["python3", "/home/sidecar-script.py"]
24 changes: 24 additions & 0 deletions healthcheck-sidecar/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Healthcheck Sidecar

This application is meant to be deployed alongside the ui pod.

## Development

### building

podman build . -f Containerfile --platform linux/amd64 -t quay.io/grpereir/ilab-ui-healthcheck-sidecar:latest

### running

podman run --platform linux/amd64 --rm -e IL_GRANITE_API=localhost -e IL_MERLINITE_API=localhost -it quay.io/grpereir/ilab-ui-healthcheck-sidecar:latest /bin/bash

### push

podman push quay.io/grpereir/ilab-ui-healthcheck-sidecar:latest

## How does it work

`sidecar-script.py` is the entrypoint of the sidecar container. It's a simple Python script that reads the `IL_GRANITE_API` and `IL_MERLINITE_API` environment variables
and uses those values to check the `/health` endpoint of each server. It then aggregates both results into a JSON object and serves that as the
payload at `localhost:8080/health`. The UI deployment picks this up via a readinessProbe whose command runs the `healthcheck-probe.sh` script. This script
has been added to the UI container build process, along with the installation of `jq` as its dependency, via the UI Containerfile.
1 change: 1 addition & 0 deletions healthcheck-sidecar/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
requests
65 changes: 65 additions & 0 deletions healthcheck-sidecar/sidecar-script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import http.server
import socketserver
import json
import threading
import time
import requests
import os
import logging

################## SETUP LOGGING AND VALIDATE ENV ##################

logger = logging.getLogger(__name__)

def validate_env():
    """Check that both model endpoint environment variables are set.

    Variables are checked in order (granite first). For the first one that is
    missing or empty, the error is logged and raised.

    Raises:
        ValueError: if `IL_GRANITE_API` or `IL_MERLINITE_API` is unset or empty.
    """
    required = (
        ("IL_GRANITE_API", "granite"),
        ("IL_MERLINITE_API", "merlinite"),
    )
    for var_name, model in required:
        if os.getenv(var_name):
            continue
        error = f"expecting {model} API endpoint as env variable `${var_name}`, which does not exist."
        logging.error(error)
        raise ValueError(error)

validate_env()

################## GLOBALS ##################

# Last observed state of each model endpoint; stays "unknown" until the
# polling loop has written a result.
health_status = dict(granite_api="unknown", merlinite_api="unknown")

# Health endpoints derived from the endpoint base URLs in the environment.
granite_api_health_url = "{}/health".format(os.getenv("IL_GRANITE_API"))
merlinite_api_health_url = "{}/health".format(os.getenv("IL_MERLINITE_API"))

def _probe_endpoint(url):
    """Return "healthy" if a GET to `url` succeeds with a status below 400, else "unhealthy".

    Any `requests` error (connection failure, timeout, ...) counts as unhealthy.
    """
    try:
        return "healthy" if requests.get(url, timeout=5).ok else "unhealthy"
    except requests.exceptions.RequestException:
        return "unhealthy"


# Update health status function
def update_health_status():
    """Poll both model endpoints forever, refreshing `health_status` every 10 seconds.

    Each endpoint is probed independently, so a failure on one endpoint
    neither skips the other check nor overwrites its result.
    """
    while True:
        health_status["granite_api"] = _probe_endpoint(granite_api_health_url)
        health_status["merlinite_api"] = _probe_endpoint(merlinite_api_health_url)
        time.sleep(10)

class HealthHandler(http.server.BaseHTTPRequestHandler):
    """Serve the current `health_status` as JSON on GET /health; 404 for any other path.

    Built on BaseHTTPRequestHandler rather than SimpleHTTPRequestHandler so
    that no inherited file-serving method (such as do_HEAD) remains reachable.
    """

    def do_GET(self):
        if self.path != "/health":
            self.send_response(404)
            self.end_headers()
            return

        body = json.dumps(health_status).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        # Tell the client exactly how many bytes to expect.
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

# Poll the model endpoints in the background; daemon=True so this thread
# never keeps the process alive on its own.
threading.Thread(target=update_health_status, daemon=True).start()

# Allow rebinding port 8080 straight after a restart, even while sockets from
# the previous process are still lingering.
socketserver.TCPServer.allow_reuse_address = True

with socketserver.TCPServer(("", 8080), HealthHandler) as httpd:
    print("Serving health status on port 8080")
    httpd.serve_forever()
4 changes: 4 additions & 0 deletions src/Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@ FROM registry.access.redhat.com/ubi9/nodejs-22:9.5-1730543890
WORKDIR /opt/app-root/src

# Copy the dependency manifests and the readiness-probe script before the rest
# of the sources.
COPY package*.json ./
COPY src/healthcheck-probe.sh ./

# Root is needed for the package install and the ownership/mode changes below.
USER root
# jq is required by healthcheck-probe.sh.
RUN dnf install -y jq
RUN chown -R default:root /opt/app-root/src/package*.json
RUN chown -R default:root /opt/app-root/src/healthcheck-probe.sh
RUN chmod +x /opt/app-root/src/healthcheck-probe.sh
# Switch back to the unprivileged user for npm and the remaining build steps.
USER default
RUN npm install
COPY ./ .
Expand Down
26 changes: 26 additions & 0 deletions src/healthcheck-probe.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

# -*- indent-tabs-mode: nil; tab-width: 2; sh-indentation: 2; -*-

# Probe script meant to run as a readinessProbe (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes)
# Queries the healthcheck sidecar on localhost:8080 and exits non-zero unless
# both the granite and merlinite endpoints are reported as "healthy".
# requires jq

set -x
set -e
set -o pipefail

# --silent --show-error: no progress meter, but real errors are still printed.
# --fail: an HTTP error status fails the probe instead of being parsed as data.
# --max-time: never hang the probe if the sidecar stops answering.
health_curl=$(curl --silent --show-error --fail --max-time 5 localhost:8080/health)

# jq prints the object only when the field equals "healthy"; empty output
# therefore means the endpoint is not healthy.
granite_curl_healthy=$(echo "${health_curl}" | jq '. | select(.granite_api=="healthy")')
if [[ -z "${granite_curl_healthy}" ]]; then
  echo "granite not healthy!"
  exit 1
fi

merlinite_curl_healthy=$(echo "${health_curl}" | jq '. | select(.merlinite_api=="healthy")')
if [[ -z "${merlinite_curl_healthy}" ]]; then
  echo "merlinite not healthy!"
  exit 1
fi

exit 0