-
Notifications
You must be signed in to change notification settings - Fork 448
103 lines (86 loc) · 3.17 KB
/
e2e-test-tune-api.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# End-to-end test workflow exercising the tune API.
name: E2E Test with tune API

# Run on pull requests, except those whose changes are confined to the
# UI frontend sources listed under paths-ignore.
on:
  pull_request:
    paths-ignore:
      - "pkg/ui/v1beta1/frontend/**"
# One active run per workflow + ref: a newer run for the same ref cancels
# the run that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Single e2e job: set up the test environment, run the tune API e2e test,
  # then collect diagnostics (memory, disk, pods, kubelet and docker logs).
  # Diagnostic steps use `if: always()` so they still run after a failure.
  e2e:
    runs-on: ubuntu-22.04
    timeout-minutes: 120

    strategy:
      fail-fast: false
      matrix:
        # Kubernetes versions to test with
        kubernetes-version: ["v1.29.2"]

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Repo-local action; receives the Kubernetes version from the matrix.
      - name: Setup Test Env
        uses: ./.github/workflows/template-setup-e2e-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}

      - name: Install Training Operator SDK
        shell: bash
        run: |
          pip install "kubeflow-training[huggingface]==1.8.1"

      # --force is required: there is no terminal in CI to answer the
      # confirmation prompt, so without it nothing is pruned.
      - name: Check Disk Space Before Test
        run: |
          docker system prune --all --force
          docker volume prune --force
          echo "Checking disk space usage before e2e test..."
          df -h # Run 'df' to check free disk space

      - name: Monitor Memory Usage Before Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage Before Run
        if: always()
        run: |
          docker stats --no-stream

      # Repo-local action that runs the test itself.
      # NOTE(review): the exact meaning of the `tune-api` and
      # `training-operator` inputs is defined by that action - confirm there.
      - name: Run e2e test with tune API
        if: always()
        uses: ./.github/workflows/template-e2e-test
        with:
          tune-api: true
          training-operator: true

      - name: Get YAML file of Experiment
        if: always()
        run: |
          echo "Fetching the YAML file of the experiment..."
          kubectl get experiment tune-example-2 -n default -o yaml

      - name: Monitor Memory Usage After Run
        if: always()
        run: free -h

      - name: Monitor Docker Container Memory Usage After Run
        if: always()
        run: |
          docker stats --no-stream

      - name: Check Disk Space After Test
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Checking disk space usage after e2e test..."
          df -h # Run 'df' to check free disk space

      # Best-effort diagnostics. The script runs under `bash -e`, and grep
      # exits non-zero when nothing matches, so every grep is guarded to keep
      # a missing pod or missing events from aborting the rest of the step.
      - name: Fetch Experiment Pod Logs
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Fetching all the pods in the default namespace..."
          kubectl get pods -n default
          POD_NAMES=$(kubectl get pods -n default --no-headers -o custom-columns=":metadata.name" | grep tune-example-2 | grep master || true)
          if [ -z "${POD_NAMES}" ]; then
            echo "No master pod found for experiment tune-example-2."
          fi
          # Iterate so that several matching pods are each described and logged.
          for pod in ${POD_NAMES}; do
            echo "Fetching pod description for experiment pod..."
            kubectl describe pod "${pod}" -n default
            echo "Fetching logs for experiment pod..."
            kubectl logs "${pod}" -n default --all-containers
          done
          echo "Fetching events for experiment pod..."
          kubectl get events -n default | grep "tune-example-2" || echo "No events found for tune-example-2."

      - name: Fetch Kubelet Logs
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Fetching kubelet logs..."
          sudo journalctl -u kubelet

      - name: Check container runtime logs
        if: always() # Run this step even if previous steps fail
        run: |
          echo "Checking container runtime logs..."
          sudo journalctl -u docker