# [WIP] Add e2e test for tune API with LLM hyperparameter optimization (#290)
# Workflow file for this run.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# End-to-end test of the tune API (LLM hyperparameter optimization).
#
# Triggered by pull requests, except those whose changes are limited to the
# UI frontend. Besides running the test itself, the job dumps host and
# cluster diagnostics (memory, disk, pod logs, kubelet/docker journals)
# before and after the run.
name: E2E Test with tune API

on:
  pull_request:
    paths-ignore:
      - "pkg/ui/v1beta1/frontend/**"

# Keep only the newest run per workflow/ref; older in-flight runs are cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
    runs-on: ubuntu-22.04
    timeout-minutes: 120

    strategy:
      fail-fast: false
      matrix:
        # Kubernetes versions to test with
        kubernetes-version: ["v1.29.2"]

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      - name: Install Training Operator SDK
        shell: bash
        run: |
          pip install "kubeflow-training[huggingface]==1.8.1"

      - name: Check Disk Space Before Test
        run: |
          # -f skips the interactive confirmation prompt; nobody can answer
          # it on a CI runner, so without it nothing is reliably pruned.
          docker system prune -a -f
          docker volume prune -f
          echo "Checking disk space usage before e2e test..."
          df -h  # Run 'df' to check free disk space

      - name: Monitor Memory Usage Before Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage Before Run
        if: always()
        run: |
          docker stats --no-stream

      # Deliberately NOT `if: always()`: the test must not start when setup
      # failed, nor when the run was cancelled via the concurrency group.
      - name: Run e2e test with tune API
        uses: ./.github/workflows/template-e2e-test
        with:
          tune-api: true
          training-operator: true

      # Everything below is diagnostics and runs even if earlier steps failed.
      - name: Get YAML file of Experiment
        if: always()
        run: |
          echo "Fetching the YAML file of the experiment..."
          kubectl get experiment tune-example-2 -n default -o yaml

      - name: Monitor Memory Usage After Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage After Run
        if: always()
        run: |
          docker stats --no-stream

      - name: Check Disk Space After Test
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Checking disk space usage after e2e test..."
          df -h  # Run 'df' to check free disk space

      - name: Fetch Experiment Pod Logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Fetching all the pods in the default namespace..."
          kubectl get pods -n default
          # grep exits non-zero when nothing matches; `|| true` keeps the
          # step alive so the remaining diagnostics are still printed.
          POD_NAMES=$(kubectl get pods -n default --no-headers \
            -o custom-columns=":metadata.name" \
            | grep tune-example-2 | grep master || true)
          if [ -z "$POD_NAMES" ]; then
            echo "No tune-example-2 master pod found; skipping describe/logs."
          else
            # Unquoted on purpose: one iteration per matched pod name.
            for POD_NAME in $POD_NAMES; do
              echo "Fetching pod description for experiment pod..."
              kubectl describe pod "$POD_NAME" -n default
              echo "Fetching logs for experiment pod..."
              kubectl logs "$POD_NAME" -n default --all-containers
            done
          fi
          echo "Fetching events for experiment pod..."
          kubectl get events -n default | grep "tune-example-2" \
            || echo "No events found for tune-example-2."

      - name: Fetch Kubelet Logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Fetching kubelet logs..."
          sudo journalctl -u kubelet

      - name: Check container runtime logs
        if: always()  # Run this step even if previous steps fail
        run: |
          echo "Checking container runtime logs..."
          sudo journalctl -u docker