Merge branch 'main' into reduce-asyncio-oh

vllm-project · Sep 4, 2024 · aa64042
2 parents d0f9641 + d331156
commit aa64042
Show file tree

Hide file tree

Showing 14 changed files with 609 additions and 247 deletions.
diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
@@ -1,36 +1,43 @@
 import os
+import sys
 import zipfile
 
-MAX_SIZE_MB = 250
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
 
 
 def print_top_10_largest_files(zip_file):
+    """Print the top 10 largest files in the given zip file."""
     with zipfile.ZipFile(zip_file, 'r') as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
-            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+            print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
 
 
 def check_wheel_size(directory):
+    """Check the size of .whl files in the given directory."""
     for root, _, files in os.walk(directory):
-        for f in files:
-            if f.endswith(".whl"):
-                wheel_path = os.path.join(root, f)
-                wheel_size = os.path.getsize(wheel_path)
-                wheel_size_mb = wheel_size / (1024 * 1024)
-                if wheel_size_mb > MAX_SIZE_MB:
-                    print(
-                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
-                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+        for file_name in files:
+            if file_name.endswith(".whl"):
+                wheel_path = os.path.join(root, file_name)
+                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
+                if wheel_size_mb > VLLM_MAX_SIZE_MB:
+                    print(f"Not allowed: Wheel {wheel_path} is larger "
+                          f"({wheel_size_mb:.2f} MB) than the limit "
+                          f"({VLLM_MAX_SIZE_MB} MB).")
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
                     print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb} MB).")
+                          f"({wheel_size_mb:.2f} MB).")
     return 0
 
 
 if __name__ == "__main__":
-    import sys
-    sys.exit(check_wheel_size(sys.argv[1]))
+    if len(sys.argv) < 2:
+        print("Usage: python check-wheel-size.py <directory>")
+        sys.exit(1)
+
+    directory = sys.argv[1]
+    sys.exit(check_wheel_size(directory))
diff --git a/Dockerfile b/Dockerfile
@@ -108,10 +108,17 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
-# check the size of the wheel, we cannot upload wheels larger than 100MB
+# Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
-RUN python3 check-wheel-size.py dist
-
+# Default max size of the wheel is 250MB
+ARG VLLM_MAX_SIZE_MB=250
+ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
+ARG RUN_WHEEL_CHECK=true
+RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
+        python3 check-wheel-size.py dist; \
+    else \
+        echo "Skipping wheel size check."; \
+    fi
 #################### EXTENSION Build IMAGE ####################
 
 #################### DEV IMAGE ####################

diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py
@@ -11,25 +11,33 @@
 from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 
-# Input audio and question
-audio_and_sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate
-question = "What is recited in the audio?"
+audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")]
+question_per_audio_count = [
+    "What is recited in the audio?",
+    "What sport and what nursery rhyme are referenced?"
+]
 
 
 # Ultravox 0.3
-def run_ultravox(question):
+def run_ultravox(question, audio_count):
     model_name = "fixie-ai/ultravox-v0_3"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     messages = [{
-        'role': 'user',
-        'content': f"<|reserved_special_token_0|>\n{question}"
+        'role':
+        'user',
+        'content':
+        "<|reserved_special_token_0|>\n" * audio_count + question
     }]
     prompt = tokenizer.apply_chat_template(messages,
                                            tokenize=False,
                                            add_generation_prompt=True)
 
-    llm = LLM(model=model_name)
+    llm = LLM(model=model_name,
+              enforce_eager=True,
+              enable_chunked_prefill=False,
+              max_model_len=8192,
+              limit_mm_per_prompt={"audio": audio_count})
     stop_token_ids = None
     return llm, prompt, stop_token_ids
 
@@ -44,7 +52,9 @@ def main(args):
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")
 
-    llm, prompt, stop_token_ids = model_example_map[model](question)
+    audio_count = args.num_audios
+    llm, prompt, stop_token_ids = model_example_map[model](
+        question_per_audio_count[audio_count - 1], audio_count)
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
@@ -53,23 +63,18 @@ def main(args):
                                      stop_token_ids=stop_token_ids)
 
     assert args.num_prompts > 0
-    if args.num_prompts == 1:
-        # Single inference
-        inputs = {
-            "prompt": prompt,
-            "multi_modal_data": {
-                "audio": audio_and_sample_rate
-            },
-        }
-
-    else:
+    inputs = {
+        "prompt": prompt,
+        "multi_modal_data": {
+            "audio": [
+                asset.audio_and_sample_rate
+                for asset in audio_assets[:audio_count]
+            ]
+        },
+    }
+    if args.num_prompts > 1:
         # Batch inference
-        inputs = [{
-            "prompt": prompt,
-            "multi_modal_data": {
-                "audio": audio_and_sample_rate
-            },
-        } for _ in range(args.num_prompts)]
+        inputs = [inputs] * args.num_prompts
 
     outputs = llm.generate(inputs, sampling_params=sampling_params)
 
@@ -92,6 +97,11 @@ def main(args):
                         type=int,
                         default=1,
                         help='Number of prompts to run.')
+    parser.add_argument("--num-audios",
+                        type=int,
+                        default=1,
+                        choices=[1, 2],
+                        help="Number of audio items per prompt.")
 
     args = parser.parse_args()
     main(args)
diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
@@ -6,6 +6,7 @@
 from vllm import LLM, RequestOutput, SamplingParams
 
 from ...conftest import cleanup
+from ..openai.test_vision import TEST_IMAGE_URLS
 
 MODEL_NAME = "facebook/opt-125m"
 
@@ -159,3 +160,36 @@ def test_chat():
     ]
     outputs = llm.chat(messages)
     assert len(outputs) == 1
+
+
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    llm = LLM(
+        model="microsoft/Phi-3.5-vision-instruct",
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) >= 0