From 5e680272bfdb31892e482c94678af99755e6ebd2 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 10 Jun 2024 18:57:39 +0000 Subject: [PATCH 1/3] p Signed-off-by: kevin --- .buildkite/run-benchmarks.sh | 2 +- .buildkite/test-pipeline.yaml | 156 -------------------------------- .buildkite/test-template-aws.j2 | 19 +--- 3 files changed, 2 insertions(+), 175 deletions(-) diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh index 75e9cf6a6579a..cbf6dda677c53 100644 --- a/.buildkite/run-benchmarks.sh +++ b/.buildkite/run-benchmarks.sh @@ -54,7 +54,7 @@ tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines echo '```' >> benchmark_results.md # if the agent binary is not found, skip uploading the results, exit 0 -if [ ! -f buildkite-agent ]; then +if [ ! -f /usr/bin/buildkite-agent ]; then exit 0 fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b48ef31bc4163..346fb1b63cdd3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -4,165 +4,9 @@ # the final pipeline yaml file. steps: -- label: Regression Test - mirror_hardwares: [amd] - command: pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional - -- label: AsyncEngine Test - #mirror_hardwares: [amd] - command: pytest -v -s async_engine - -- label: Basic Correctness Test - mirror_hardwares: [amd] - commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - -- label: Core Test - mirror_hardwares: [amd] - command: pytest -v -s core - -- label: Distributed Comm Ops Test - #mirror_hardwares: [amd] - command: pytest -v -s distributed/test_comm_ops.py - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - -- label: Distributed Tests - mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - commands: - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - -- label: Distributed Tests (Multiple Groups) - #mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - commands: - - pytest -v -s distributed/test_pynccl.py - -- label: Engine Test - mirror_hardwares: [amd] - command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py - -- label: Entrypoints Test - mirror_hardwares: [amd] - - commands: - - pytest -v -s entrypoints -m llm - - pytest -v -s entrypoints -m openai - -- label: Examples Test - working_dir: "/vllm-workspace/examples" - mirror_hardwares: [amd] - commands: - # install aws cli for llava_example.py - # install tensorizer for tensorize_vllm_model.py - - pip install awscli tensorizer - - python3 offline_inference.py - - python3 offline_inference_with_prefix.py - - python3 llm_engine_example.py - - python3 llava_example.py - - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - -- label: Inputs Test - #mirror_hardwares: [amd] - commands: - - bash ../.buildkite/download-images.sh - - pytest -v -s test_inputs.py - - pytest -v -s multimodal - -- label: Kernels Test %N - #mirror_hardwares: [amd] - command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 - -- label: Models Test - #mirror_hardwares: [amd] - commands: - - pytest -v -s models -m \"not llava\" - -- label: Llava Test - mirror_hardwares: [amd] - commands: - - bash ../.buildkite/download-images.sh - - pytest -v -s models -m llava - -- label: Prefix Caching Test - mirror_hardwares: [amd] - commands: - - pytest -v -s prefix_caching - -- label: Samplers Test - #mirror_hardwares: [amd] - command: pytest -v -s samplers - -- label: LogitsProcessor Test - mirror_hardwares: [amd] - command: pytest -v -s test_logits_processor.py - -- label: Utils Test - command: pytest -v -s test_utils.py - -- label: Worker Test - mirror_hardwares: [amd] - command: pytest -v -s worker - -- label: Speculative decoding tests - #mirror_hardwares: [amd] - commands: - # See https://github.com/vllm-project/vllm/issues/5152 - - export VLLM_ATTENTION_BACKEND=XFORMERS - - pytest -v -s spec_decode - -- label: LoRA Test %N - #mirror_hardwares: [amd] - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py - parallelism: 4 - -- label: LoRA Long Context (Distributed) - #mirror_hardwares: [amd] - num_gpus: 4 - # This test runs llama 13B, so it is required to run on 4 GPUs. - commands: - - pytest -v -s -x lora/test_long_context.py - -- label: Tensorizer Test - #mirror_hardwares: [amd] - command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader - -- label: Metrics Test - mirror_hardwares: [amd] - command: pytest -v -s metrics - -- label: Quantization Test - #mirror_hardwares: [amd] - command: pytest -v -s quantization - - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" mirror_hardwares: [amd] commands: - pip install aiohttp - bash run-benchmarks.sh - -- label: Documentation Build - working_dir: "/vllm-workspace/test_docs/docs" - no_gpu: True - commands: - - pip install -r requirements-docs.txt - - SPHINXOPTS=\"-W\" make html diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 3b5d36b246673..c992d41604b62 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -2,23 +2,6 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - {% for step in steps %} - label: "{{ step.label }}" agents: @@ -43,7 +26,7 @@ steps: limit: 5 plugins: - docker#v5.2.0: - image: {{ docker_image }} + image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:c5602f0baa8fa42df11853a0d422bc140cf04c9a" always-pull: true propagate-environment: true {% if not step.no_gpu %} From f0aa9d4b01d3b00131c433a6ca87ff817294b36b Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 10 Jun 2024 18:58:36 +0000 Subject: [PATCH 2/3] p Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index c992d41604b62..3b5d36b246673 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -2,6 +2,23 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: + - label: ":docker: build image" + agents: + queue: cpu_queue + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." + - "docker push {{ docker_image }}" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + - wait + {% for step in steps %} - label: "{{ step.label }}" agents: @@ -26,7 +43,7 @@ steps: limit: 5 plugins: - docker#v5.2.0: - image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:c5602f0baa8fa42df11853a0d422bc140cf04c9a" + image: {{ docker_image }} always-pull: true propagate-environment: true {% if not step.no_gpu %} From b3b962f3dd7d0901da77bd9b67e22cb9fe525266 Mon Sep 17 00:00:00 2001 From: kevin Date: Mon, 10 Jun 2024 20:58:45 +0000 Subject: [PATCH 3/3] p Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 156 ++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 346fb1b63cdd3..b48ef31bc4163 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -4,9 +4,165 @@ # the final pipeline yaml file. steps: +- label: Regression Test + mirror_hardwares: [amd] + command: pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: AsyncEngine Test + #mirror_hardwares: [amd] + command: pytest -v -s async_engine + +- label: Basic Correctness Test + mirror_hardwares: [amd] + commands: + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +- label: Core Test + mirror_hardwares: [amd] + command: pytest -v -s core + +- label: Distributed Comm Ops Test + #mirror_hardwares: [amd] + command: pytest -v -s distributed/test_comm_ops.py + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + +- label: Distributed Tests + mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + commands: + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py + - pytest -v -s spec_decode/e2e/test_integration_dist.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + +- label: Distributed Tests (Multiple Groups) + #mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + commands: + - pytest -v -s distributed/test_pynccl.py + +- label: Engine Test + mirror_hardwares: [amd] + command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py + +- label: Entrypoints Test + mirror_hardwares: [amd] + + commands: + - pytest -v -s entrypoints -m llm + - pytest -v -s entrypoints -m openai + +- label: Examples Test + working_dir: "/vllm-workspace/examples" + mirror_hardwares: [amd] + commands: + # install aws cli for llava_example.py + # install tensorizer for tensorize_vllm_model.py + - pip install awscli tensorizer + - python3 offline_inference.py + - python3 offline_inference_with_prefix.py + - python3 llm_engine_example.py + - python3 llava_example.py + - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + +- label: Inputs Test + #mirror_hardwares: [amd] + commands: + - bash ../.buildkite/download-images.sh + - pytest -v -s test_inputs.py + - pytest -v -s multimodal + +- label: Kernels Test %N + #mirror_hardwares: [amd] + command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 4 + +- label: Models Test + #mirror_hardwares: [amd] + commands: + - pytest -v -s models -m \"not llava\" + +- label: Llava Test + mirror_hardwares: [amd] + commands: + - bash ../.buildkite/download-images.sh + - pytest -v -s models -m llava + +- label: Prefix Caching Test + mirror_hardwares: [amd] + commands: + - pytest -v -s prefix_caching + +- label: Samplers Test + #mirror_hardwares: [amd] + command: pytest -v -s samplers + +- label: LogitsProcessor Test + mirror_hardwares: [amd] + command: pytest -v -s test_logits_processor.py + +- label: Utils Test + command: pytest -v -s test_utils.py + +- label: Worker Test + mirror_hardwares: [amd] + command: pytest -v -s worker + +- label: Speculative decoding tests + #mirror_hardwares: [amd] + commands: + # See https://github.com/vllm-project/vllm/issues/5152 + - export VLLM_ATTENTION_BACKEND=XFORMERS + - pytest -v -s spec_decode + +- label: LoRA Test %N + #mirror_hardwares: [amd] + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py + parallelism: 4 + +- label: LoRA Long Context (Distributed) + #mirror_hardwares: [amd] + num_gpus: 4 + # This test runs llama 13B, so it is required to run on 4 GPUs. + commands: + - pytest -v -s -x lora/test_long_context.py + +- label: Tensorizer Test + #mirror_hardwares: [amd] + command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader + +- label: Metrics Test + mirror_hardwares: [amd] + command: pytest -v -s metrics + +- label: Quantization Test + #mirror_hardwares: [amd] + command: pytest -v -s quantization + - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" mirror_hardwares: [amd] commands: - pip install aiohttp - bash run-benchmarks.sh + +- label: Documentation Build + working_dir: "/vllm-workspace/test_docs/docs" + no_gpu: True + commands: + - pip install -r requirements-docs.txt + - SPHINXOPTS=\"-W\" make html