From 6dc7283c74bdfc809f518144402f9976107a71ad Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Wed, 9 Oct 2024 23:17:17 -0700
Subject: [PATCH] [Core] Add an environment variable which needs to be set
 explicitly to allow BlockSpaceManagerV1 (#9149)

BlockSpaceManagerV1 is deprecated. Config validation (`_verify_args` in
vllm/config.py) now raises a ValueError when `use_v2_block_manager` is
false unless VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 is set in the
environment.

Test modules that still exercise the V1 block manager get a module-scoped
autouse fixture that asserts the variable is set, and the CI pipeline runs
those modules in separate commands with the variable exported. The
benchmark scripts default `--use-v2-block-manager` to
`EngineArgs.use_v2_block_manager`.

Signed-off-by: Sumit Dubey
---
Review notes (ignored by `git am`):
 - Core Test: dropped the stray `core` positional argument from the three
   per-file commands so each runs only the named file with the override,
   and removed a duplicated `--ignore=core/block/e2e/test_correctness.py`.
 - tests/core/test_scheduler.py: the fixture now reports its own path
   instead of tests/core/test_chunked_prefill_scheduler.py.
 - vllm/config.py: closed the unterminated backtick in the error message
   and removed a redundant line-continuation backslash.
 - tests/utils.py: added a one-line docstring to the new helper.

 .buildkite/test-pipeline.yaml                  | 18 ++++++++++++------
 benchmarks/benchmark_latency.py                |  4 +++-
 benchmarks/benchmark_prefix_caching.py         |  2 ++
 benchmarks/benchmark_throughput.py             |  1 +
 .../basic_correctness/test_chunked_prefill.py  |  8 +++++++-
 tests/core/block/e2e/test_correctness.py       |  7 +++++++
 .../e2e/test_correctness_sliding_window.py     |  7 +++++++
 tests/core/test_chunked_prefill_scheduler.py   |  7 +++++++
 tests/core/test_scheduler.py                   |  7 +++++++
 tests/prefix_caching/test_prefix_caching.py    |  7 +++++++
 tests/spec_decode/e2e/test_compatibility.py    |  7 +++++++
 tests/utils.py                                 | 10 ++++++++++
 vllm/config.py                                 | 12 ++++++++++++
 vllm/envs.py                                   |  6 ++++++
 14 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 804b2fb2988f6..ccc5003e66beb 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -77,8 +77,8 @@ steps:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill
   commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test # 10min
   mirror_hardwares: [amd]
@@ -88,7 +88,11 @@ steps:
   - vllm/distributed
   - tests/core
   commands:
-  - pytest -v -s core
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_chunked_prefill_scheduler.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/block/e2e/test_correctness_sliding_window.py
+  - pytest -v -s core --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py
 
 - label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
@@ -185,7 +189,8 @@ steps:
   - vllm/
   - tests/prefix_caching
   commands:
-  - pytest -v -s prefix_caching
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py
+  - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py
 
 - label: Samplers Test # 36min
   source_file_dependencies:
@@ -209,7 +214,8 @@ steps:
   - tests/spec_decode
   commands:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+  - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py
 
 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amd]
@@ -391,7 +397,7 @@ steps:
   - pytest -v -s ./compile/test_full_graph_multi_gpu.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
+  - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 938d7acd5687c..79a48b2a1a845 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -221,7 +221,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument("--enable-prefix-caching",
                         action='store_true',
                         help="Enable automatic prefix caching")
-    parser.add_argument('--use-v2-block-manager', action='store_true')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        default=EngineArgs.use_v2_block_manager)
     parser.add_argument(
         "--ray-workers-use-nsight",
         action='store_true',
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index eeb43a692076e..f14092d347343 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -33,6 +33,7 @@
 from transformers import PreTrainedTokenizerBase
 
 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser
 
 try:
@@ -177,6 +178,7 @@ def main(args):
                         help='enable prefix caching')
     parser.add_argument('--use-v2-block-manager',
                         action='store_true',
+                        default=EngineArgs.use_v2_block_manager,
                         help='Use BlockSpaceMangerV2')
     parser.add_argument('--num-prompts',
                         type=int,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 3781863f77e64..b7bc2a6402375 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -473,6 +473,7 @@ def main(args: argparse.Namespace):
         help="Maximum number of forward steps per scheduler call.")
     parser.add_argument("--use-v2-block-manager",
                         action='store_true',
+                        default=EngineArgs.use_v2_block_manager,
                         help="Enable block manager v2.")
     parser.add_argument(
         "--enable-prefix-caching",
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 14c5447680729..e8819688c9e83 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -12,7 +12,7 @@
 import pytest
 
 from ..models.utils import check_logprobs_close, check_outputs_equal
-from ..utils import multi_gpu_test
+from ..utils import check_deprecated_block_manager_usage, multi_gpu_test
 
 MODELS = [
     "facebook/opt-125m",
@@ -20,6 +20,12 @@
 ]
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_deprecated_block_manager():
+    check_deprecated_block_manager_usage(
+        'tests/basic_correctness/test_chunked_prefill.py')
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [32])
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 033778d2c35e0..b3f626714d351 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -2,11 +2,18 @@
 
 import pytest
 
+from tests.utils import check_deprecated_block_manager_usage
 from vllm import SamplingParams
 
 from .conftest import get_token_ids_from_llm_generator
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_deprecated_block_manager():
+    check_deprecated_block_manager_usage(
+        'tests/core/block/e2e/test_correctness.py')
+
+
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py
index e98292e807d73..731131984b0eb 100644
--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+from tests.utils import check_deprecated_block_manager_usage
 from vllm import LLM, SamplingParams
 
 from .conftest import get_text_from_llm_generator
@@ -12,6 +13,12 @@
 BLOCK_SIZE = 16
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_deprecated_block_manager():
+    check_deprecated_block_manager_usage(
+        'tests/core/block/e2e/test_correctness_sliding_window.py')
+
+
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py
index 9dddd751c7858..c9495fd50d7c9 100644
--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -8,6 +8,7 @@
 from vllm.core.scheduler import Scheduler
 from vllm.sequence import Logprob, SequenceGroup
 
+from ..utils import check_deprecated_block_manager_usage
 from .utils import create_dummy_prompt
 
 
@@ -27,6 +28,12 @@ def schedule_and_update_computed_tokens(scheduler):
     return metas, out
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_deprecated_block_manager():
+    check_deprecated_block_manager_usage(
+        'tests/core/test_chunked_prefill_scheduler.py')
+
+
 @pytest.mark.parametrize('use_v2_block_manager', [True, False])
 def test_simple(use_v2_block_manager: bool):
     """Verify basic scheduling works."""
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 88c6c3bb28e43..5cdf743a4509c 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -12,11 +12,18 @@
 from vllm.lora.request import LoRARequest
 from vllm.sequence import SequenceGroup, SequenceStatus
 
+from ..utils import check_deprecated_block_manager_usage
 from .utils import (append_new_token, append_new_token_seq_group,
                     create_dummy_prompt, get_sequence_groups,
                     schedule_and_update_computed_tokens)
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_deprecated_block_manager():
+    check_deprecated_block_manager_usage(
+        "tests/core/test_scheduler.py")
+
+
 @pytest.mark.parametrize('use_v2_block_manager', [True, False])
 def test_scheduler_add_seq_group(use_v2_block_manager: bool):
     block_size = 4
diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 2dff84b812b89..88437425feb31 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -7,6 +7,7 @@
 import pytest
 
 from tests.kernels.utils import override_backend_env_variable
+from tests.utils import check_deprecated_block_manager_usage
 from vllm.block import PhysicalTokenBlock
 from vllm.core.block_manager_v1 import CachedBlockAllocator
 from vllm.utils import Device
@@ -18,6 +19,12 @@
 ]
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_deprecated_block_manager():
+    check_deprecated_block_manager_usage(
+        'tests/prefix_caching/test_prefix_caching.py')
+
+
 @pytest.mark.parametrize("block_size", [16])
 @pytest.mark.parametrize("num_blocks", [16])
 def test_block_allocator(
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
index 9f0af211e264a..69ea81cfffed4 100644
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -1,10 +1,17 @@
 import pytest
 
+from tests.utils import check_deprecated_block_manager_usage
 from vllm import SamplingParams
 
 from .conftest import get_output_from_llm_generator
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_deprecated_block_manager():
+    check_deprecated_block_manager_usage(
+        'tests/spec_decode/e2e/test_compatibility.py')
+
+
 @pytest.mark.parametrize(
     "common_llm_kwargs",
     [{
diff --git a/tests/utils.py b/tests/utils.py
index 115cab80691f0..924465057468f 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -678,3 +678,13 @@ def get_client_text_logprob_generations(
     return [(text_generations, text,
              (None if x.logprobs is None else x.logprobs.top_logprobs))
             for completion in completions for x in completion.choices]
+
+
+def check_deprecated_block_manager_usage(test_name: str):
+    """Assert that VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is enabled."""
+    assert envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1 is True, (
+        f"To allow the use of deprecated BlockSpaceManagerV1, set the "
+        f"environment variable VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1. "
+        f"You can run the tests with: "
+        f"`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest {test_name}`"  #noqa
+    )
diff --git a/vllm/config.py b/vllm/config.py
index 7b3996dc90b94..91ba45798b4ba 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1037,6 +1037,18 @@ def _verify_args(self) -> None:
                 f"({self.num_scheduler_steps}) must be greater than or "
                 "equal to 1.")
 
+        if (not self.use_v2_block_manager
+                and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1):
+            raise ValueError(
+                "The use of BlockSpaceManagerV1 is deprecated and will "
+                "be removed in a future release. Please switch to "
+                "BlockSpaceManagerV2 by setting --use-v2-block-manager to "
+                "True. If you wish to suppress this error temporarily, "
+                "you can set the environment variable "
+                "`VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1`. If your use "
+                "case is not supported in BlockSpaceManagerV2, please "
+                "file an issue with detailed information.")
+
     @property
     def is_multi_step(self) -> bool:
         return self.num_scheduler_steps > 1
diff --git a/vllm/envs.py b/vllm/envs.py
index f65f5c6bcc9bb..97767bf5b5ad9 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -64,6 +64,7 @@
     VLLM_USE_TRITON_AWQ: bool = False
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
+    VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1: bool = False
 
 
 def get_default_cache_root():
@@ -434,6 +435,11 @@ def get_default_config_root():
     # and trust the driver's peer-to-peer capability report.
     "VLLM_SKIP_P2P_CHECK":
     lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1",
+
+    # If set, allows the use of the deprecated block manager V1
+    "VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1":
+    lambda: os.environ.get("VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1", "0"
+                           ) == "1",
 }
 
 # end-env-vars-definition