From 5e680272bfdb31892e482c94678af99755e6ebd2 Mon Sep 17 00:00:00 2001
From: kevin <kevin@anyscale.com>
Date: Mon, 10 Jun 2024 18:57:39 +0000
Subject: [PATCH 1/3] ci: fix buildkite-agent check; temporarily run only Benchmarks on a pinned image

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/run-benchmarks.sh    |   2 +-
 .buildkite/test-pipeline.yaml   | 156 --------------------------------
 .buildkite/test-template-aws.j2 |  19 +---
 3 files changed, 2 insertions(+), 175 deletions(-)

diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index 75e9cf6a6579a..cbf6dda677c53 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -54,7 +54,7 @@ tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
 echo '```' >> benchmark_results.md
 
 # if the agent binary is not found, skip uploading the results, exit 0
-if [ ! -f buildkite-agent ]; then
+if ! command -v buildkite-agent >/dev/null 2>&1; then
     exit 0
 fi
 
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b48ef31bc4163..346fb1b63cdd3 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -4,165 +4,9 @@
 # the final pipeline yaml file.
 
 steps:
-- label: Regression Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
-
-- label: AsyncEngine Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s async_engine
-
-- label: Basic Correctness Test
-  mirror_hardwares: [amd]
-  commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
-- label: Core Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s core
-
-- label: Distributed Comm Ops Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s distributed/test_comm_ops.py
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-
-- label: Distributed Tests
-  mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  commands:
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
-- label: Distributed Tests (Multiple Groups)
-  #mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  commands:
-  - pytest -v -s distributed/test_pynccl.py
-
-- label: Engine Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
-
-- label: Entrypoints Test
-  mirror_hardwares: [amd]
-
-  commands:
-  - pytest -v -s entrypoints -m llm
-  - pytest -v -s entrypoints -m openai
-
-- label: Examples Test
-  working_dir: "/vllm-workspace/examples"
-  mirror_hardwares: [amd]
-  commands:
-    # install aws cli for llava_example.py
-    # install tensorizer for tensorize_vllm_model.py
-    - pip install awscli tensorizer
-    - python3 offline_inference.py
-    - python3 offline_inference_with_prefix.py
-    - python3 llm_engine_example.py
-    - python3 llava_example.py
-    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
-- label: Inputs Test
-  #mirror_hardwares: [amd]
-  commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s test_inputs.py
-    - pytest -v -s multimodal
-
-- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
-
-- label: Models Test
-  #mirror_hardwares: [amd]
-  commands:
-    - pytest -v -s models -m \"not llava\"
-
-- label: Llava Test
-  mirror_hardwares: [amd]
-  commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s models -m llava
-
-- label: Prefix Caching Test
-  mirror_hardwares: [amd]
-  commands:
-    - pytest -v -s prefix_caching
-
-- label: Samplers Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s samplers
-
-- label: LogitsProcessor Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s test_logits_processor.py
-
-- label: Utils Test
-  command: pytest -v -s test_utils.py
-
-- label: Worker Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s worker
-
-- label: Speculative decoding tests
-  #mirror_hardwares: [amd]
-  commands:
-    # See https://github.com/vllm-project/vllm/issues/5152
-    - export VLLM_ATTENTION_BACKEND=XFORMERS
-    - pytest -v -s spec_decode
-
-- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
-
-- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-    - pytest -v -s -x lora/test_long_context.py
-
-- label: Tensorizer Test
-  #mirror_hardwares: [amd]
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
-
-- label: Metrics Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s metrics
-
-- label: Quantization Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s quantization
-
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   mirror_hardwares: [amd]
   commands:
   - pip install aiohttp
   - bash run-benchmarks.sh
-
-- label: Documentation Build
-  working_dir: "/vllm-workspace/test_docs/docs"
-  no_gpu: True
-  commands:
-  - pip install -r requirements-docs.txt
-  - SPHINXOPTS=\"-W\" make html
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 3b5d36b246673..c992d41604b62 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -2,23 +2,6 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 
 steps:
-  - label: ":docker: build image"
-    agents:
-      queue: cpu_queue
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
-      - "docker push {{ docker_image }}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-  - wait
-
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:
@@ -43,7 +26,7 @@ steps:
           limit: 5
     plugins:
       - docker#v5.2.0:
-          image: {{ docker_image }}
+          image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:c5602f0baa8fa42df11853a0d422bc140cf04c9a"
           always-pull: true
           propagate-environment: true
           {% if not step.no_gpu %}

From f0aa9d4b01d3b00131c433a6ca87ff817294b36b Mon Sep 17 00:00:00 2001
From: kevin <kevin@anyscale.com>
Date: Mon, 10 Jun 2024 18:58:36 +0000
Subject: [PATCH 2/3] ci: restore image build step and templated docker image in AWS test template

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-template-aws.j2 | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index c992d41604b62..3b5d36b246673 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -2,6 +2,23 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 
 steps:
+  - label: ":docker: build image"
+    agents:
+      queue: cpu_queue
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker push {{ docker_image }}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+  - wait
+
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:
@@ -26,7 +43,7 @@ steps:
           limit: 5
     plugins:
       - docker#v5.2.0:
-          image: "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:c5602f0baa8fa42df11853a0d422bc140cf04c9a"
+          image: {{ docker_image }}
           always-pull: true
           propagate-environment: true
           {% if not step.no_gpu %}

From b3b962f3dd7d0901da77bd9b67e22cb9fe525266 Mon Sep 17 00:00:00 2001
From: kevin <kevin@anyscale.com>
Date: Mon, 10 Jun 2024 20:58:45 +0000
Subject: [PATCH 3/3] ci: restore full set of test pipeline steps

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-pipeline.yaml | 156 ++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 346fb1b63cdd3..b48ef31bc4163 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -4,9 +4,165 @@
 # the final pipeline yaml file.
 
 steps:
+- label: Regression Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
+
+- label: AsyncEngine Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s async_engine
+
+- label: Basic Correctness Test
+  mirror_hardwares: [amd]
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+- label: Core Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s core
+
+- label: Distributed Comm Ops Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s distributed/test_comm_ops.py
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+
+- label: Distributed Tests
+  mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  commands:
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+
+- label: Distributed Tests (Multiple Groups)
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  commands:
+  - pytest -v -s distributed/test_pynccl.py
+
+- label: Engine Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+
+- label: Entrypoints Test
+  mirror_hardwares: [amd]
+
+  commands:
+  - pytest -v -s entrypoints -m llm
+  - pytest -v -s entrypoints -m openai
+
+- label: Examples Test
+  working_dir: "/vllm-workspace/examples"
+  mirror_hardwares: [amd]
+  commands:
+    # install aws cli for llava_example.py
+    # install tensorizer for tensorize_vllm_model.py
+    - pip install awscli tensorizer
+    - python3 offline_inference.py
+    - python3 offline_inference_with_prefix.py
+    - python3 llm_engine_example.py
+    - python3 llava_example.py
+    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+
+- label: Inputs Test
+  #mirror_hardwares: [amd]
+  commands:
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s test_inputs.py
+    - pytest -v -s multimodal
+
+- label: Kernels Test %N
+  #mirror_hardwares: [amd]
+  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4
+
+- label: Models Test
+  #mirror_hardwares: [amd]
+  commands:
+    - pytest -v -s models -m \"not llava\"
+
+- label: Llava Test
+  mirror_hardwares: [amd]
+  commands:
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s models -m llava
+
+- label: Prefix Caching Test
+  mirror_hardwares: [amd]
+  commands:
+    - pytest -v -s prefix_caching
+
+- label: Samplers Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s samplers
+
+- label: LogitsProcessor Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s test_logits_processor.py
+
+- label: Utils Test
+  command: pytest -v -s test_utils.py
+
+- label: Worker Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s worker
+
+- label: Speculative decoding tests
+  #mirror_hardwares: [amd]
+  commands:
+    # See https://github.com/vllm-project/vllm/issues/5152
+    - export VLLM_ATTENTION_BACKEND=XFORMERS
+    - pytest -v -s spec_decode
+
+- label: LoRA Test %N
+  #mirror_hardwares: [amd]
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+  parallelism: 4
+
+- label: LoRA Long Context (Distributed)
+  #mirror_hardwares: [amd]
+  num_gpus: 4
+  # This test runs llama 13B, so it is required to run on 4 GPUs.
+  commands:
+    - pytest -v -s -x lora/test_long_context.py
+
+- label: Tensorizer Test
+  #mirror_hardwares: [amd]
+  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+
+- label: Metrics Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s metrics
+
+- label: Quantization Test
+  #mirror_hardwares: [amd]
+  command: pytest -v -s quantization
+
 - label: Benchmarks
   working_dir: "/vllm-workspace/.buildkite"
   mirror_hardwares: [amd]
   commands:
   - pip install aiohttp
   - bash run-benchmarks.sh
+
+- label: Documentation Build
+  working_dir: "/vllm-workspace/test_docs/docs"
+  no_gpu: True
+  commands:
+  - pip install -r requirements-docs.txt
+  - SPHINXOPTS=\"-W\" make html