-
Notifications
You must be signed in to change notification settings - Fork 97
87 lines (85 loc) · 4.13 KB
/
t3000-model-perf-tests.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
name: "(T3K) T3000 model perf tests"
on:
workflow_dispatch:
schedule:
- cron: "0 */12 * * *" # This cron schedule runs the workflow every 12 hours
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
secrets: inherit
t3000-model-perf-tests:
needs: build-artifact
strategy:
fail-fast: false
matrix:
test-group: [
{ name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
{ name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 75, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 75, owner_id: U03FJB5TM5Y}, # Colman Glagovich
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: U053W15B6JF}, # Djordje Ivanovic
{ name: "t3k CNN resnet50 model perf tests", model: "resnet50", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_resnet50_tests, timeout: 75, owner_id: U013121KDH9}, # Austin Ho
#{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
]
name: ${{ matrix.test-group.name }}
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.test-group.arch }}
LOGURU_LEVEL: INFO
LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib
environment: dev
runs-on: ["arch-wormhole_b0", "config-t3000", "in-service", "pipeline-perf"]
steps:
- uses: tenstorrent-metal/metal-workflows/.github/actions/checkout-with-submodule-lfs@v2.0.0
- name: Enable performance mode
run: |
sudo cpupower frequency-set -g performance
- name: Ensure weka mount is active
run: |
sudo systemctl restart mnt-MLPerf.mount
sudo /etc/rc.local
ls -al /mnt/MLPerf/bit_error_tests
- name: Set up dynamic env vars for build
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
- name: Extract files
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
export PYTHONPATH=$TT_METAL_HOME
source ${{ github.workspace }}/tests/scripts/t3000/run_t3000_model_perf_tests.sh
${{ matrix.test-group.cmd }}
env python models/perf/merge_perf_results.py
- name: Check perf report exists
id: check-perf-report
if: ${{ !cancelled() }}
run: |
ls -hal
export PERF_REPORT_FILENAME="Models_Perf_$(date +%Y_%m_%d).csv"
ls -hal $PERF_REPORT_FILENAME
echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
- uses: ./.github/actions/slack-report
if: ${{ failure() }}
with:
slack_webhook_url: ${{ secrets.SLACK_WEBHOOK_URL }}
owner: ${{ matrix.test-group.owner_id }}
- name: Disable performance mode
if: always()
run: |
sudo cpupower frequency-set -g ondemand