# Works with uint8 models #6
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow header: display name, triggers, and environment shared by every job.
name: Benchmarks

# Runs on pushes to the three benchmark branches, or when dispatched manually.
on:
  push:
    branches:
      - master
      - update_benchmark
      - update_benchmark_staging
  workflow_dispatch:
    inputs:
      # NOTE(review): no step visible in this workflow reads this input;
      # confirm whether it is still meant to gate the process replay steps.
      run_process_replay:
        description: "Run process replay tests"
        required: false
        default: false
        type: boolean

# Environment variables exported to every step of every job.
env:
  # TODO: this rescheduling makes gpt2, mixtral and llama unjitted slower
  RUN_PROCESS_REPLAY: "1"
  ASSERT_PROCESS_REPLAY: "0"
  PYTHONPATH: .
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# One job per self-hosted machine class. Every job:
#   * only runs when the repository owner is 'tinygrad',
#   * uses `bash -o pipefail` so a failing command in front of `| tee` fails the step,
#   * on the update_benchmark_staging branch exports CACHEDB=/tmp/staging.db and
#     removes any previous staging db files,
#   * resets process replay first, uploads its result files as an artifact, and
#     finishes by running process_replay.py from a checkout of origin/master.
jobs:
  # Inference and short training benchmarks on the self-hosted macOS runner.
  testmacbenchmark:
    name: Mac Benchmark
    runs-on: [self-hosted, macOS]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Symlink models and datasets
        run: |
          mkdir -p weights
          ln -s ~/tinygrad/extra/disassemblers/applegpu extra/disassemblers/applegpu
          ln -s ~/tinygrad/weights/sd-v1-4.ckpt weights/sd-v1-4.ckpt
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
      - name: setup staging db
        if: github.ref == 'refs/heads/update_benchmark_staging'
        run: |
          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
      - name: reset process replay
        run: test/external/process_replay/reset.py
      - name: Run Stable Diffusion
        run: JIT=2 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
      - name: Run Stable Diffusion with fp16
        run: JIT=2 python3 examples/stable_diffusion.py --fp16 --seed 0 --noshow --timing | tee sd_fp16.txt
      - name: Run SDXL
        run: JIT=2 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
      - name: Run model inference benchmark
        run: METAL=1 python3 test/external/external_model_benchmark.py
      - name: Test speed vs torch
        run: BIG=2 MPS=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Test tensor cores
        run: METAL=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
      - name: Run Tensor Core GEMM
        run: |
          DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
          DEBUG=2 HALF=1 python3 extra/gemm/simple_matmul.py | tee matmul_half.txt
      - name: Fuzz Padded Tensor Core GEMM
        run: METAL=1 M_START=6 M_STOP=10 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=6 K_STOP=24 K_STEP=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
      - name: Run LLaMA
        run: |
          JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
          JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
      - name: Run LLaMA with BEAM
        run: JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
      - name: Run quantized LLaMA
        run: |
          python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize int8 | tee llama_int8.txt
          python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing --quantize nf4 | tee llama_nf4.txt
      - name: Run LLaMA 7B on 4 (virtual) GPUs
        run: python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
      - name: Run GPT2
        run: |
          JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run GPT2 w HALF
        run: HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
      - name: Run GPT2 w HALF/BEAM
        run: HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
      - name: Train MNIST
        run: time PYTHONPATH=. TARGET_EVAL_ACC_PCT=96.9 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
      - name: Run 10 CIFAR training steps
        run: JIT=2 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
      - name: Run 10 CIFAR training steps w HALF
        run: JIT=2 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
      #- name: Run 10 CIFAR training steps w BF16
      #  run: STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
      - name: Run 10 CIFAR training steps w winograd
        run: JIT=2 WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (Mac)
          # train_cifar_bf16.txt is not produced while the BF16 step above stays
          # commented out; the entry is kept for when that step is re-enabled.
          # onnx_inference_speed.csv is not teed by any step here (presumably
          # written by external_model_benchmark.py; verify).
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            llama_unjitted.txt
            llama_jitted.txt
            llama_beam.txt
            llama_int8.txt
            llama_nf4.txt
            llama_four_gpu.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            gpt2_half.txt
            gpt2_half_beam.txt
            matmul.txt
            matmul_half.txt
            sd.txt
            sd_fp16.txt
            sdxl.txt
            beautiful_mnist.txt
            train_cifar.txt
            train_cifar_half.txt
            train_cifar_bf16.txt
            train_cifar_wino.txt
      # process_replay.py is copied out of the tree first because the checkout of
      # origin/master replaces the working tree before the script is run.
      - name: Run process replay tests
        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  # Inference benchmarks on the runner labelled tinyboxgreen (NV / CUDA / PTX backends).
  testnvidiabenchmark:
    name: tinybox green Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Print nvidia-smi
        run: nvidia-smi
      - name: Symlink models and datasets
        run: |
          mkdir -p weights
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
          ln -s /raid/weights/LLaMA-3 weights/LLaMA-3
          mkdir -p extra/datasets
          ln -s /raid/datasets/imagenet extra/datasets/imagenet
      - name: setup staging db
        if: github.ref == 'refs/heads/update_benchmark_staging'
        run: |
          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
      - name: reset process replay
        run: test/external/process_replay/reset.py
      - name: Run model inference benchmark
        run: NV=1 NOCLANG=1 python3 test/external/external_model_benchmark.py
      - name: Test speed vs torch
        run: NV=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Test tensor cores
        run: |
          NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
          PTX=1 NV=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
      - name: Run Tensor Core GEMM (CUDA)
        run: |
          CUDA=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
          CUDA=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt
      - name: Run Tensor Core GEMM (PTX)
        run: NV=1 PTX=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
      - name: Run Tensor Core GEMM (NV)
        run: NV=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt
      - name: Run Tensor Core GEMM (NV) with BEAM
        run: BEAM=4 NV=1 HALF=1 IGNORE_BEAM_CACHE=1 DEBUG=2 python3 extra/gemm/simple_matmul.py
      - name: Run Stable Diffusion
        run: NV=1 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
      - name: Run SDXL
        run: NV=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
      - name: Run LLaMA
        run: |
          NV=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
          NV=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
      - name: Run LLaMA with BEAM
        run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
      - name: Run LLaMA 7B on 4 GPUs
        run: NV=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
      - name: Run LLaMA 7B on 6 GPUs
        run: NV=1 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
      - name: Run LLaMA-3 8B BEAM
        run: NV=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
      - name: Run LLaMA-3 8B on 4 GPUs
        run: NV=1 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
      - name: Run LLaMA-3 8B on 6 GPUs
        run: NV=1 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
      - name: Run LLaMA-2 70B
        run: NV=1 MAX_CONTEXT=256 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
      - name: Run Mixtral 8x7B
        run: time NV=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
      - name: Run GPT2
        run: |
          NV=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          NV=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run GPT2 w HALF
        run: NV=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
      - name: Run GPT2 w HALF/BEAM
        run: NV=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (NVIDIA)
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            matmul.txt
            matmul_bfloat16.txt
            matmul_ptx.txt
            matmul_nv.txt
            sd.txt
            sdxl.txt
            llama_unjitted.txt
            llama_jitted.txt
            llama_beam.txt
            llama_four_gpu.txt
            llama_six_gpu.txt
            llama3_beam.txt
            llama3_four_gpu.txt
            llama3_six_gpu.txt
            llama_2_70B.txt
            mixtral.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            gpt2_half.txt
            gpt2_half_beam.txt
      - name: Run process replay tests
        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  # Training benchmarks on the runner labelled tinyboxgreen.
  testmorenvidiabenchmark:
    name: tinybox green Training Benchmark
    runs-on: [self-hosted, Linux, tinyboxgreen]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Symlink models and datasets
        # Both target directories are created before any link is placed in them.
        run: |
          mkdir -p weights extra/datasets
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
          ln -s /raid/datasets/imagenet extra/datasets/imagenet
      - name: setup staging db
        if: github.ref == 'refs/heads/update_benchmark_staging'
        run: |
          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
      - name: reset process replay
        run: test/external/process_replay/reset.py
      - name: Fuzz Padded Tensor Core GEMM (NV)
        run: NV=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
      - name: Fuzz Padded Tensor Core GEMM (PTX)
        run: NV=1 PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
      - name: Train MNIST
        run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.9 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
      - name: Run 10 CIFAR training steps
        run: NV=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
      - name: Run 10 CIFAR training steps w HALF
        run: NV=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
      - name: Run 10 CIFAR training steps w BF16
        run: NV=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
      - name: Run 10 CIFAR training steps w winograd
        run: NV=1 WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
      - name: Run full CIFAR training w 1 GPU
        run: time NV=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
      - name: Run full CIFAR training steps w 6 GPUS
        run: time NV=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
      - name: Run MLPerf resnet eval on training data
        run: time NV=1 MODEL=resnet python3 examples/mlperf/model_eval.py
      - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
        run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
      - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
        run: NV=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (NVIDIA Training)
          path: |
            beautiful_mnist.txt
            train_cifar.txt
            train_cifar_half.txt
            train_cifar_bf16.txt
            train_cifar_wino.txt
            train_cifar_one_gpu.txt
            train_resnet.txt
            train_resnet_one_gpu.txt
            train_cifar_six_gpu.txt
      - name: Run process replay tests
        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  # Inference benchmarks on the runner labelled tinybox (AMD backend).
  testamdbenchmark:
    name: tinybox red Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Symlink models and datasets
        # Both target directories are created before any link is placed in them.
        run: |
          mkdir -p weights extra/datasets
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
          ln -s /raid/weights/LLaMA-3 weights/LLaMA-3
          ln -s /raid/datasets/imagenet extra/datasets/imagenet
      - name: setup staging db
        if: github.ref == 'refs/heads/update_benchmark_staging'
        run: |
          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
      - name: reset process replay
        run: test/external/process_replay/reset.py
      - name: Show off tinybox
        run: /opt/rocm/bin/rocm-bandwidth-test
      # TODO: unstable on AMD
      #- name: Run model inference benchmark
      #  run: LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 NOCLANG=1 python3 test/external/external_model_benchmark.py
      # TODO: unstable on AMD
      #- name: Test speed vs torch
      #  run: |
      #    python3 -c "import torch; print(torch.__version__)"
      #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/test_speed_v_torch.py | tee torch_speed.txt
      - name: Test tensor cores
        run: |
          AMD=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded
      - name: Run Tensor Core GEMM (AMD)
        run: AMD=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_amd.txt
      # TODO: AMD compiler bug causes this to fail
      #- name: Fuzz Padded Tensor Core GEMM
      #  run: HSA=1 M_START=12 M_STOP=20 M_STEP=1 N_START=12 N_STOP=20 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 DEBUG=2 python3 ./extra/gemm/fuzz_matmul.py
      - name: Run Stable Diffusion
        run: AMD=1 python3 examples/stable_diffusion.py --seed 0 --noshow --timing | tee sd.txt
      - name: Run SDXL
        run: AMD=1 python3 examples/sdxl.py --seed 0 --noshow --timing | tee sdxl.txt
      - name: Run LLaMA 7B
        run: |
          AMD=1 JIT=0 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_unjitted.txt
          AMD=1 JIT=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_jitted.txt
      - name: Run LLaMA 7B with BEAM
        run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama.py --gen 1 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_beam.txt
      - name: Run LLaMA 7B on 4 GPUs
        run: AMD=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_four_gpu.txt
      - name: Run LLaMA 7B on 6 GPUs
        run: AMD=1 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
      - name: Run LLaMA-3 8B BEAM
        run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
      - name: Run LLaMA-3 8B on 4 GPUs
        run: AMD=1 python3 examples/llama3.py --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
      - name: Run LLaMA-3 8B on 6 GPUs
        run: AMD=1 python3 examples/llama3.py --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
      - name: Run LLaMA-2 70B
        run: AMD=1 python3 examples/llama.py --gen 2 --size 70B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_2_70B.txt
      - name: Run Mixtral 8x7B
        run: time AMD=1 python3 examples/mixtral.py --temperature 0 --count 10 --timing | tee mixtral.txt
      - name: Run GPT2
        run: |
          AMD=1 JIT=0 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_unjitted.txt
          AMD=1 JIT=1 python3 examples/gpt2.py --prompt "Hello." --count 10 --temperature 0 --timing | tee gpt2_jitted.txt
      - name: Run GPT2 w HALF
        run: AMD=1 HALF=1 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half.txt
      - name: Run GPT2 w HALF/BEAM
        run: AMD=1 HALF=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAST_BEFORE_VIEW=0 python3 examples/gpt2.py --count 10 --temperature 0 --timing | tee gpt2_half_beam.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (AMD)
          # torch_speed.txt is only teed by a step that is commented out above,
          # and no active step in this job writes matmul.txt (the GEMM step tees
          # to matmul_amd.txt). The entries are kept for when those steps return.
          path: |
            onnx_inference_speed.csv
            torch_speed.txt
            llama_unjitted.txt
            llama_jitted.txt
            llama_beam.txt
            llama_four_gpu.txt
            llama_six_gpu.txt
            llama3_beam.txt
            llama3_four_gpu.txt
            llama3_six_gpu.txt
            llama_2_70B.txt
            gpt2_unjitted.txt
            gpt2_jitted.txt
            gpt2_half.txt
            gpt2_half_beam.txt
            matmul.txt
            matmul_amd.txt
            sd.txt
            sdxl.txt
            mixtral.txt
      - name: Run process replay tests
        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  # Training benchmarks on the runner labelled tinybox (AMD backend).
  testmoreamdbenchmark:
    name: tinybox red Training Benchmark
    runs-on: [self-hosted, Linux, tinybox]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Symlink models and datasets
        # Both target directories are created before any link is placed in them.
        run: |
          mkdir -p weights extra/datasets
          ln -s ~/tinygrad/weights/bpe_simple_vocab_16e6.txt.gz weights/bpe_simple_vocab_16e6.txt.gz
          ln -s ~/tinygrad/weights/LLaMA weights/LLaMA
          ln -s ~/tinygrad/extra/datasets/cifar-10-python.tar.gz extra/datasets/cifar-10-python.tar.gz
          ln -s /raid/weights/mixtral-8x7b-32kseqlen weights/mixtral-8x7b-32kseqlen
          ln -s /raid/weights/LLaMA-2 weights/LLaMA-2
          ln -s /raid/datasets/imagenet extra/datasets/imagenet
      - name: setup staging db
        if: github.ref == 'refs/heads/update_benchmark_staging'
        run: |
          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
      - name: reset process replay
        run: test/external/process_replay/reset.py
      - name: Train MNIST
        run: time PYTHONPATH=. AMD=1 TARGET_EVAL_ACC_PCT=96.9 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
      - name: Run 10 CIFAR training steps
        run: AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
      - name: Run 10 CIFAR training steps w HALF
        run: AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
      - name: Run 10 CIFAR training steps w BF16
        run: AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
      - name: Run 10 CIFAR training steps w winograd
        run: AMD=1 WINO=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
      - name: Run full CIFAR training w 1 GPU
        run: time AMD=1 DEFAULT_FLOAT=HALF LATEWINO=1 STEPS=1000 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_one_gpu.txt
      - name: Run full CIFAR training steps w 6 GPUS
        run: time AMD=1 DEFAULT_FLOAT=HALF STEPS=350 BS=1536 GPUS=6 TARGET_EVAL_ACC_PCT=93.2 python3 examples/hlb_cifar10.py | tee train_cifar_six_gpu.txt
      - name: Run MLPerf resnet eval
        run: time AMD=1 MODEL=resnet python3 examples/mlperf/model_eval.py
      - name: Run 10 MLPerf ResNet50 training steps (1 gpu)
        run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=256 GPUS=1 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet_one_gpu.txt
      - name: Run 10 MLPerf ResNet50 training steps (6 gpu)
        run: AMD=1 DEFAULT_FLOAT=HALF BENCHMARK=10 BS=1536 GPUS=6 MODEL=resnet python3 examples/mlperf/model_train.py | tee train_resnet.txt
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (AMD Training)
          path: |
            beautiful_mnist.txt
            train_cifar.txt
            train_cifar_half.txt
            train_cifar_bf16.txt
            train_cifar_wino.txt
            train_cifar_one_gpu.txt
            train_resnet.txt
            train_resnet_one_gpu.txt
            train_cifar_six_gpu.txt
      - name: Run process replay tests
        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py

  # openpilot model compile/benchmark runs on the runner labelled comma (QCOM backend).
  testqualcommbenchmark:
    name: comma Benchmark
    runs-on: [self-hosted, Linux, comma]
    defaults:
      run:
        shell: bash -o pipefail {0}
    if: github.repository_owner == 'tinygrad'
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: setup staging db
        if: github.ref == 'refs/heads/update_benchmark_staging'
        run: |
          echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
          rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
      - name: reset process replay
        run: test/external/process_replay/reset.py
      - name: openpilot compile 0.9.4
        run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python examples/openpilot/compile2.py | tee openpilot_compile_0_9_4.txt
      - name: openpilot compile 0.9.7
        run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python examples/openpilot/compile2.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_compile_0_9_7.txt
      # Logged to its own file: the IMAGE=2 0.9.7 benchmark below tees to
      # openpilot_image_0_9_7.txt and would otherwise overwrite this output.
      - name: validate openpilot 0.9.7
        run: PYTHONPATH=. FLOAT16=0 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_validate_0_9_7.txt
      - name: benchmark openpilot 0.9.4
        run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_4.txt
      - name: benchmark openpilot 0.9.7
        run: PYTHONPATH=. QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_0_9_7.txt
      - name: benchmark openpilot w IMAGE=2 0.9.4
        run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_4.txt
      - name: benchmark openpilot w IMAGE=2 0.9.7
        run: PYTHONPATH=. NOLOCALS=1 FLOAT16=1 IMAGE=2 QCOM=1 taskset -c 4-7 python3 test/external/external_benchmark_openpilot.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx | tee openpilot_image_0_9_7.txt
      - name: openpilot compile3 0.9.7
        run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx
      - name: openpilot compile3 0.9.7+ tomb raider
        run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/e8bea2c78ffa92685ece511e9b554122aaf1a79d/selfdrive/modeld/models/supercombo.onnx
      - name: openpilot dmonitoring compile3 0.9.7
        run: PYTHONPATH="." QCOM=1 taskset -c 4-7 python3 examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/dmonitoring_model.onnx
      # Uploaded before process replay, as in the other jobs, so a process replay
      # failure does not prevent the benchmark logs from being published.
      - uses: actions/upload-artifact@v4
        with:
          name: Speed (comma)
          path: |
            openpilot_compile_0_9_4.txt
            openpilot_compile_0_9_7.txt
            openpilot_validate_0_9_7.txt
            openpilot_0_9_4.txt
            openpilot_0_9_7.txt
            openpilot_image_0_9_4.txt
            openpilot_image_0_9_7.txt
      - name: Run process replay tests
        run: cp test/external/process_replay/process_replay.py ./process_replay.py && git fetch origin master && git -c advice.detachedHead=false checkout origin/master && PYTHONPATH=. python3 process_replay.py