From 9c66cd52105745c2d30190619872a9dbf1aad101 Mon Sep 17 00:00:00 2001 From: Alexey Kondratiev Date: Tue, 3 Sep 2024 20:16:32 +0000 Subject: [PATCH 1/8] Enabling kernels tests, ignoring some of then that fail --- .buildkite/run-amd-test.sh | 67 +++++++++++++++++++++++++++++++---- .buildkite/test-pipeline.yaml | 1 + 2 files changed, 62 insertions(+), 6 deletions(-) mode change 100644 => 100755 .buildkite/run-amd-test.sh diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh old mode 100644 new mode 100755 index 5548071390aff..ff9c5116e030a --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,5 +1,6 @@ # This script runs test inside the corresponding ROCm docker container. -set -ex +#set -ex +set -o pipefail # Print ROCm version echo "--- Confirming Clean Initial State" @@ -70,16 +71,70 @@ HF_CACHE="$(realpath ~)/huggingface" mkdir -p ${HF_CACHE} HF_MOUNT="/root/.cache/huggingface" -docker run \ +commands=$@ +echo "Commands:$commands" +#ignore certain kernels tests +if [[ $commands == *" kernels "* ]]; then + commands="${commands} \ + --ignore=kernels/test_attention.py \ + --ignore=kernels/test_attention_selector.py \ + --ignore=kernels/test_blocksparse_attention.py \ + --ignore=kernels/test_cutlass.py \ + --ignore=kernels/test_encoder_decoder_attn.py \ + --ignore=kernels/test_flash_attn.py \ + --ignore=kernels/test_flashinfer.py \ + --ignore=kernels/test_int8_quant.py \ + --ignore=kernels/test_machete_gemm.py \ + --ignore=kernels/test_marlin_gemm.py \ + --ignore=kernels/test_prefix_prefill.py \ + --ignore=kernels/test_rand.py \ + --ignore=kernels/test_sampler.py" +fi + +PARALLEL_JOB_COUNT=8 +#check if the command contains shard flag +if [[ $commands == *"--shard-id="* ]]; then + for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do + #replace shard arguments + commands=${@//"--shard-id= "/"--shard-id=${GPU} "} + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + docker run \ --device /dev/kfd --device /dev/dri \ --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=0 \ + -e HIP_VISIBLE_DEVICES=${GPU} \ -e HF_TOKEN \ -v ${HF_CACHE}:${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ + --name ${container_name}_${GPU} \ ${image_name} \ - /bin/bash -c "${@}" - + /bin/bash -c "${commands}" \ + |& while read -r line; do echo ">>Shard $GPU: $line"; done & + PIDS+=($!) + done + #wait for all processes to finish and collect exit codes + for pid in ${PIDS[@]}; do + wait ${pid} + STATUS+=($?) + done + for st in ${STATUS[@]}; do + if [[ ${st} -ne 0 ]]; then + echo "One of the processes failed with $st" + exit ${st} + fi + done +else + docker run \ + --device /dev/kfd --device /dev/dri \ + --network host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES=0 \ + -e HF_TOKEN \ + -v ${HF_CACHE}:${HF_MOUNT} \ + -e HF_HOME=${HF_MOUNT} \ + --name ${container_name} \ + ${image_name} \ + /bin/bash -c "${commands}" +fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9f449ff650b90..73a0c2ff1a954 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -224,6 +224,7 @@ steps: parallelism: 4 - label: Kernels Test %N # 30min each + mirror_hardwares: [amd] source_file_dependencies: - csrc/ - vllm/attention From fc4c1b0cde73ba696a0a019de58f33f7ff8134b7 Mon Sep 17 00:00:00 2001 From: Alexey Kondratiev Date: Tue, 3 Sep 2024 21:34:41 +0000 Subject: [PATCH 2/8] Trigger Build From fb113d3b22ef146d3d57884dfb5d755cf868d6f9 Mon Sep 17 00:00:00 2001 From: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com> Date: Wed, 4 Sep 2024 20:35:34 -0400 Subject: [PATCH 3/8] Adding one more test to ignore list. --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index fbe2afaa43bf2..a7fcbf31417b3 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -78,6 +78,7 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_attention.py \ --ignore=kernels/test_attention_selector.py \ --ignore=kernels/test_blocksparse_attention.py \ + --ignore=kernels/test_causal_conv1d.py \ --ignore=kernels/test_cutlass.py \ --ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_flash_attn.py \ From 6b6f0c12cc2abc48de59bb18b6c552657b24bc21 Mon Sep 17 00:00:00 2001 From: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com> Date: Thu, 5 Sep 2024 12:46:08 -0400 Subject: [PATCH 4/8] Trying with four parallel jobs instead of eight --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index a7fcbf31417b3..cadbd8c9d2240 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -91,7 +91,7 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_sampler.py" fi -PARALLEL_JOB_COUNT=8 +PARALLEL_JOB_COUNT=4 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do From 06e6d00188e65e709556e7b24d2d06820eda9f54 Mon Sep 17 00:00:00 2001 From: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:55:38 -0400 Subject: [PATCH 5/8] Adding test_mamba_ssm.py to ignore list --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index cadbd8c9d2240..cdec57602e28f 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -85,6 +85,7 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_flashinfer.py \ --ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_machete_gemm.py \ + --ignore=kernels/test_mamba_ssm.py \ --ignore=kernels/test_marlin_gemm.py \ --ignore=kernels/test_prefix_prefill.py \ --ignore=kernels/test_rand.py \ From eab96dce68fcf077f896a40cc8f4acc6c675aa94 Mon Sep 17 00:00:00 2001 From: alexeykondrat <143633163+alexeykondrat@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:31:16 -0400 Subject: [PATCH 6/8] Switching back to 8 parallel jobs --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index cdec57602e28f..c9b72a3264e82 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -92,7 +92,7 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_sampler.py" fi -PARALLEL_JOB_COUNT=4 +PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do From b193d64d277e4cf472851a7cfdfa35dbba3d598a Mon Sep 17 00:00:00 2001 From: "Alexey Kondratiev(AMD)" <143633163+alexeykondrat@users.noreply.github.com> Date: Wed, 11 Sep 2024 12:00:38 -0400 Subject: [PATCH 7/8] Excluding test_moe.py test --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index c9b72a3264e82..6659440135ff4 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -87,6 +87,7 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_mamba_ssm.py \ --ignore=kernels/test_marlin_gemm.py \ + --ignore=kernels/test_moe.py \ --ignore=kernels/test_prefix_prefill.py \ --ignore=kernels/test_rand.py \ --ignore=kernels/test_sampler.py" From 43eeb136077cd566cdd59900c5af521d29c7c70a Mon Sep 17 00:00:00 2001 From: "Alexey Kondratiev(AMD)" <143633163+alexeykondrat@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:16:13 -0400 Subject: [PATCH 8/8] Excluding kernels/test_gguf.py tests --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 6659440135ff4..9274a30e04325 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_flash_attn.py \ --ignore=kernels/test_flashinfer.py \ + --ignore=kernels/test_gguf.py \ --ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_mamba_ssm.py \