# From pull request #290: "[WIP] Add e2e test for tune api with LLM hyperparameter optimization"

# End-to-end test workflow for the tune API.
#
# Runs on every pull request except those that only touch the UI frontend,
# executes the tune-API e2e test through the repository-local
# `template-e2e-test` action, and then dumps diagnostics (memory, disk,
# experiment YAML, pod logs, kubelet/docker journals) regardless of the
# test outcome.
name: E2E Test with tune API

on:
  pull_request:
    paths-ignore:
      - "pkg/ui/v1beta1/frontend/**"

# Only one run per workflow/ref pair; a newer push cancels the older run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
    runs-on: ubuntu-22.04
    timeout-minutes: 120

    strategy:
      fail-fast: false
      matrix:
        # Kubernetes versions to test with
        kubernetes-version: ["v1.29.2"]

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Repository-local action; receives the Kubernetes version from the
      # matrix above.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      - name: Install Training Operator SDK
        shell: bash
        run: |
          pip install "kubeflow-training[huggingface]==1.8.1"

      - name: Check Disk Space Before Test
        run: |
          # --force: without it both prune commands wait for a y/N
          # confirmation that a non-interactive runner cannot answer, so
          # nothing would actually be reclaimed.
          docker system prune -a -f
          docker volume prune -f
          echo "Checking disk space usage before e2e test..."
          df -h  # Run 'df' to check free disk space

      - name: Monitor Memory Usage Before Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage Before Run
        if: always()
        run: |
          docker stats --no-stream

      # NOTE(review): `always()` makes the test run even when an earlier
      # step (e.g. environment setup) failed - confirm this is intended
      # beyond WIP debugging.
      - name: Run e2e test with tune API
        if: always()
        uses: ./.github/workflows/template-e2e-test
        with:
          tune-api: true
          training-operator: true

      - name: Get YAML file of Experiment
        if: always()
        run: |
          echo "Fetching the YAML file of the experiment..."
          kubectl get experiment tune-example-2 -n default -o yaml

      - name: Monitor Memory Usage After Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage After Run
        if: always()
        run: |
          docker stats --no-stream

      - name: Check Disk Space After Test
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Checking disk space usage after e2e test..."
          df -h  # Run 'df' to check free disk space

      - name: Fetch Experiment Pod Logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Fetching all the pods in the default namespace..."
          kubectl get pods -n default

          # `|| true`: grep exits 1 when nothing matches, which would abort
          # this script (the default shell runs with -e) before the events
          # below are printed.
          POD_NAMES=$(kubectl get pods -n default --no-headers \
            -o custom-columns=":metadata.name" \
            | grep tune-example-2 | grep master || true)

          if [ -z "${POD_NAMES}" ]; then
            echo "No tune-example-2 master pod found in the default namespace."
          fi

          # Iterate so that zero or several matching pods are both handled;
          # each command is best-effort so one failure does not hide the rest.
          for POD_NAME in ${POD_NAMES}; do
            echo "Fetching pod description for experiment pod ${POD_NAME}..."
            kubectl describe pod "${POD_NAME}" -n default || true
            echo "Fetching logs for experiment pod ${POD_NAME}..."
            kubectl logs "${POD_NAME}" -n default --all-containers || true
          done

          echo "Fetching events for experiment pod..."
          kubectl get events -n default | grep "tune-example-2" || true

      - name: Fetch Kubelet Logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Fetching kubelet logs..."
          sudo journalctl --no-pager -u kubelet

      - name: Check container runtime logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Checking container runtime logs..."
          sudo journalctl --no-pager -u docker