diff --git a/requirements-common.txt b/requirements-common.txt index d7e10c7591a79..4c5b681a0d5ab 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -22,9 +22,7 @@ typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 pyzmq msgspec -librosa # Required for audio processing -soundfile # Required for audio processing gguf == 0.9.1 importlib_metadata mistral_common >= 1.3.4 -pyyaml \ No newline at end of file +pyyaml diff --git a/requirements-test.txt b/requirements-test.txt index 46eb05fc31099..58cf1716b45ce 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -13,10 +13,12 @@ pytest-shard awscli einops # required for MPT, qwen-vl and Mamba httpx +librosa # required for audio test peft requests ray sentence-transformers # required for embedding +soundfile # required for audio test compressed-tensors==0.4.0 # required for compressed-tensors timm # required for internvl test transformers_stream_generator # required for qwen-vl test @@ -30,4 +32,4 @@ aiohttp # quantization bitsandbytes==0.42.0 -buildkite-test-collector==0.1.8 \ No newline at end of file +buildkite-test-collector==0.1.8 diff --git a/setup.py b/setup.py index 21b0422c0f0bd..38d3f41663f2e 100644 --- a/setup.py +++ b/setup.py @@ -501,6 +501,7 @@ def _read_requirements(filename: str) -> List[str]: ext_modules=ext_modules, extras_require={ "tensorizer": ["tensorizer>=2.9.0"], + "audio": ["librosa", "soundfile"] # Required for audio processing }, cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, package_data=package_data, diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py index 98de10aa08408..23008f9b8b563 100644 --- a/tests/models/test_ultravox.py +++ b/tests/models/test_ultravox.py @@ -1,11 +1,9 @@ from typing import List, Optional, Tuple, Type -import librosa import numpy as np import pytest from transformers import AutoModel, AutoTokenizer, BatchEncoding -from vllm.assets.audio import AudioAsset from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE @@ -21,6 +19,7 @@ @pytest.fixture(scope="session") def audio_and_sample_rate(): + from vllm.assets.audio import AudioAsset return AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -109,6 +108,7 @@ def process(hf_inputs: BatchEncoding): dtype=dtype, postprocess_inputs=process, auto_cls=AutoModel) as hf_model: + import librosa hf_outputs_per_audio = [ hf_model.generate_greedy_logprobs_limit( diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 827a9493a70d2..7994945c5ac39 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -8,7 +8,6 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union, cast) -import librosa import numpy as np import torch import torch.utils.checkpoint @@ -107,6 +106,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): feature_extractor = whisper_feature_extractor(ctx) if sr != feature_extractor.sampling_rate: + try: + import librosa + except ImportError: + raise ImportError( + "Please install vllm[audio] for audio support.") from None audio = librosa.resample(audio, orig_sr=sr, target_sr=feature_extractor.sampling_rate) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 989b2e1a814c9..4bed267e99637 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,11 +1,9 @@ import base64 from functools import lru_cache from io import BytesIO -from typing import List, Optional, Tuple, TypeVar, Union +from typing import Any, List, Optional, Tuple, TypeVar, Union -import librosa import numpy as np -import soundfile from PIL import Image from vllm.connections import global_http_connection @@ -73,10 +71,22 @@ async def async_fetch_image(image_url: str, return image.convert(image_mode) +def try_import_audio_packages() -> Tuple[Any, Any]: + try: + import librosa + import soundfile + except ImportError: + raise ImportError( + "Please install vllm[audio] for audio support.") from None + return librosa, soundfile + + def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: """ Load audio from a URL. """ + librosa, _ = try_import_audio_packages() + if audio_url.startswith("http"): audio_bytes = global_http_connection.get_bytes( audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) @@ -95,6 +105,8 @@ async def async_fetch_audio( """ Asynchronously fetch audio from a URL. """ + librosa, _ = try_import_audio_packages() + if audio_url.startswith("http"): audio_bytes = await global_http_connection.async_get_bytes( audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) @@ -123,6 +135,8 @@ def encode_audio_base64( sampling_rate: int, ) -> str: """Encode audio as base64.""" + _, soundfile = try_import_audio_packages() + buffered = BytesIO() soundfile.write(buffered, audio, sampling_rate, format="WAV")