diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 686f70dbece6c..69b6b146b3549 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -43,7 +43,7 @@ main() { - # The figures should be genereated by a separate process outside the CI/CD pipeline + # The figures should be generated by a separate process outside the CI/CD pipeline # # generate figures # python3 -m pip install tabulate pandas matplotlib diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index 3f38cf5137535..32bd34c431c89 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -301,6 +301,104 @@ run_serving_tests() { kill_gpu_processes } +run_genai_perf_tests() { + # run genai-perf tests + + # $1: a json file specifying genai-perf test cases + local genai_perf_test_file + genai_perf_test_file=$1 + + # Iterate over genai-perf tests + jq -c '.[]' "$genai_perf_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # prepend the current serving engine to the test name + test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tp') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + reuse_server=$(echo "$common_params" | jq -r '.reuse_server') + + # get client and server arguments + server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + if [[ $reuse_server == "true" ]]; then + echo "Reuse previous server for test case $test_name" + else + kill_gpu_processes + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ + "$server_params" "$common_params" + fi + + if wait_for_server; then + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." + else + echo "" + echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." + break + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps=$num_prompts + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + backend=$CURRENT_LLM_SERVING_ENGINE + + if [[ "$backend" == *"vllm"* ]]; then + backend="vllm" + fi + #TODO: add output dir. + client_command="genai-perf profile \ + -m $model \ + --service-kind openai \ + --backend vllm \ + --endpoint-type chat \ + --streaming \ + --url localhost:$port \ + --request-rate $qps \ + --num-prompts $num_prompts \ + " + + echo "Client command: $client_command" + + eval "$client_command" + + #TODO: process/record outputs + done + done + + kill_gpu_processes + +} prepare_dataset() { @@ -328,12 +426,17 @@ main() { pip install -U transformers + pip install -r requirements-dev.txt + which genai-perf + # check storage df -h ensure_installed wget ensure_installed curl ensure_installed jq + # genai-perf dependency + ensure_installed libb64-0d prepare_dataset @@ -345,6 +448,10 @@ main() { # run the test run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" + # run genai-perf tests + run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" + mv artifacts/ $RESULTS_FOLDER/ + # upload benchmark results to buildkite python3 -m pip install tabulate pandas python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json new file mode 100644 index 0000000000000..edbe9f2df0ce0 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "llama8B_tp1_genai_perf", + "qps_list": [4,8,16,32], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "tp": 1, + "port": 8000, + "num_prompts": 500, + "reuse_server": false + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "", + "gpu_memory_utilization": 0.9, + "num_scheduler_steps": 10, + "max_num_seqs": 512, + "dtype": "bfloat16" + }, + "genai_perf_input_parameters": { + } + } +] \ No newline at end of file diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 9925db7bea593..e19ace782feb5 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -83,6 +83,6 @@ function cpu_tests() { tests/lora/test_qwen2vl.py" } -# All of CPU tests are expected to be finished less than 25 mins. +# All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 74b287c7adbfa..d2b140e718501 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,7 +52,6 @@ steps: - tests/worker - tests/standalone_tests/lazy_torch_compile.py commands: - - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git # Used by multimoda processing test - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine @@ -107,7 +106,7 @@ steps: source_file_dependencies: - vllm/ commands: - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process @@ -126,11 +125,15 @@ steps: - tests/distributed - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile + - examples/offline_inference/rlhf.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - python3 ../examples/offline_inference/rlhf.py - label: Metrics, Tracing Test # 10min num_gpus: 2 @@ -462,7 +465,10 @@ steps: - vllm/worker/worker_base.py - vllm/worker/worker.py - vllm/worker/model_runner.py + - entrypoints/llm/test_collective_rpc.py commands: + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 0226cf0ca00e9..0000000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Lint GitHub Actions workflows -on: - push: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - pull_request: - branches: - - "main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml deleted file mode 100644 index 68149d2dc019f..0000000000000 --- a/.github/workflows/clang-format.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clang-format - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - pull_request: - branches: - - main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - -jobs: - clang-format: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install clang-format==18.1.5 - - name: Running clang-format - run: | - EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' - ) - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index 68887adaae54b..0000000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: codespell - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml deleted file mode 100644 index 2f5ee8bbfd8c5..0000000000000 --- a/.github/workflows/doc-lint.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Lint documentation - -on: - push: - branches: - - main - paths: - - "docs/**" - pull_request: - branches: - - main - paths: - - "docs/**" - -jobs: - doc-lint: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/doc-lint.sh diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json deleted file mode 100644 index f6d4479ee1996..0000000000000 --- a/.github/workflows/matchers/ruff.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "ruff", - "pattern": [ - { - "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ] - } - ] - } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 73eeacf1fa562..0000000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: mypy - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'pyproject.toml' - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. - #paths: - # - '**/*.py' - # - '.github/workflows/mypy.yaml' - # - 'tools/mypy.sh' - # - 'pyproject.toml' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install mypy==1.11.1 - pip install types-setuptools - pip install types-PyYAML - pip install types-requests - pip install types-setuptools - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml deleted file mode 100644 index 4932af943a07b..0000000000000 --- a/.github/workflows/png-lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint PNG exports from excalidraw -on: - push: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - pull_request: - branches: - - "main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run png-lint.sh to check excalidraw exported images" - run: | - tools/png-lint.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..8c72a709cf330 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,17 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.12" + - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index 7266cc378cfb0..0000000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: ruff - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml - pull_request: - branches: - - main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. - #paths: - # - "**/*.py" - # - pyproject.toml - # - requirements-lint.txt - # - .github/workflows/matchers/ruff.json - # - .github/workflows/ruff.yml - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml deleted file mode 100644 index 4b1587e373e17..0000000000000 --- a/.github/workflows/shellcheck.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint shell scripts -on: - push: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - pull_request: - branches: - - "main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - shellcheck: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Check shell scripts" - run: | - tools/shellcheck.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index ff441f94435ad..0000000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: yapf - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - pull_request: - branches: - - main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000..8ea0f37885d9f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,73 @@ +repos: +- repo: https://github.com/google/yapf + rev: v0.32.0 + hooks: + - id: yapf + args: [--in-place, --verbose] + additional_dependencies: [toml] # TODO: Remove when yapf is upgraded +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.5 + hooks: + - id: ruff + args: [--output-format, github] +- repo: https://github.com/codespell-project/codespell + rev: v2.3.0 + hooks: + - id: codespell + exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' +- repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.5 + hooks: + - id: clang-format + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' + types_or: [c++, cuda] + args: [--style=file, --verbose] +- repo: https://github.com/jackdewinter/pymarkdown + rev: v0.9.27 + hooks: + - id: pymarkdown + files: docs/.* +- repo: local + hooks: + - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.9 + entry: tools/mypy.sh 1 "3.9" + language: python + types: [python] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] + - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.10 + entry: tools/mypy.sh 1 "3.10" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.11 + entry: tools/mypy.sh 1 "3.11" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward + name: Run mypy for Python 3.12 + entry: tools/mypy.sh 1 "3.12" + language: python + types: [python] + additional_dependencies: *mypy_deps + - id: shellcheck + name: Lint shell scripts + entry: tools/shellcheck.sh + language: script + types: [shell] + - id: png-lint + name: Lint PNG exports from excalidraw + entry: tools/png-lint.sh + language: script + types: [png] +- repo: https://github.com/rhysd/actionlint + rev: v1.7.6 + hooks: + - id: actionlint diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f163edc27cba8..ebe226cf6d148 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -26,10 +26,10 @@ RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace -COPY requirements-build.txt requirements-build.txt ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ pip install --upgrade pip && \ pip install -r requirements-build.txt @@ -37,9 +37,9 @@ FROM cpu-test-1 AS build WORKDIR /workspace/vllm -COPY requirements-common.txt requirements-common.txt -COPY requirements-cpu.txt requirements-cpu.txt RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ pip install -v -r requirements-cpu.txt COPY . . diff --git a/Dockerfile.rocm b/Dockerfile.rocm index e733994f8c33e..e922cb207b899 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -51,10 +51,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ *"rocm-6.2"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ - torch==2.6.0.dev20241113+rocm6.2 \ + torch \ 'setuptools-scm>=8' \ - torchvision==0.20.0.dev20241113+rocm6.2 \ - --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ + torchvision \ + --extra-index-url https://download.pytorch.org/whl/rocm6.2;; \ *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer diff --git a/SECURITY.md b/SECURITY.md index de0032d26c87b..47196a1f1221e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). --- diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 9d71e4ecc4a37..a9ab4fc9b621e 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -22,6 +22,7 @@ class RequestFuncInput: prompt_len: int output_len: int model: str + model_name: Optional[str] = None best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None @@ -78,7 +79,7 @@ async def async_request_tgi( continue chunk_bytes = chunk_bytes.decode("utf-8") - #NOTE: Sometimes TGI returns a ping response without + # NOTE: Sometimes TGI returns a ping response without # any data, we should skip it. if chunk_bytes.startswith(":"): continue @@ -235,7 +236,8 @@ async def async_request_openai_completions( async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "best_of": request_func_input.best_of, @@ -328,7 +330,8 @@ async def async_request_openai_chat_completions( if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "messages": [ { "role": "user", diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..53186e10c5452 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -525,6 +525,7 @@ async def benchmark( api_url: str, base_url: str, model_id: str, + model_name: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], @@ -553,6 +554,7 @@ async def benchmark( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( model=model_id, + model_name=model_name, prompt=test_prompt, api_url=api_url, prompt_len=test_prompt_len, @@ -573,6 +575,7 @@ async def benchmark( if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=test_prompt, api_url=base_url + "/start_profile", prompt_len=test_prompt_len, @@ -616,6 +619,7 @@ async def limited_request_func(request_func_input, pbar): async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -780,6 +784,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model + model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_mode = args.tokenizer_mode @@ -877,6 +882,7 @@ def main(args: argparse.Namespace): api_url=api_url, base_url=base_url, model_id=model_id, + model_name=model_name, tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, @@ -1222,5 +1228,12 @@ def main(args: argparse.Namespace): 'always use the slow tokenizer. \n* ' '"mistral" will always use the `mistral_common` tokenizer.') + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ") + args = parser.parse_args() main(args) diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py new file mode 100644 index 0000000000000..e1f613e1da509 --- /dev/null +++ b/benchmarks/kernels/benchmark_lora.py @@ -0,0 +1,1147 @@ +import argparse +import copy +import json +import pickle +import time +from dataclasses import dataclass +from enum import Enum, auto +from itertools import product +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement +from utils import ArgPool, Bench, CudaGraphBenchParams +from weight_shapes import WEIGHT_SHAPES + +from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand +from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice +from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink +from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand +from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.utils import FlexibleArgumentParser + +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) +DEFAULT_TP_SIZES = [1] +DEFAULT_BATCH_SIZES = [ + 1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024, + 2048, 3072, 4096, 5120, 6144, 7168, 8192 +] +DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384] +DEFAULT_LORA_RANKS = [16] +DEFAULT_NUM_LORAS = [1, 2, 3, 4] +DEFAULT_SORT_BY_LORA_IDS = [False, True] +DEFAULT_SEQ_LENGTHS = [1] +DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False] + + +# Utilities +def dtype_to_str(dtype: torch.dtype): + if dtype == torch.float16: + return "f16" + if dtype == torch.bfloat16: + return "bf16" + if dtype == torch.float32: + return "f32" + raise ValueError(f"Unsupported dtype {dtype}") + + +def make_rand_lora_weight_tensor(k: int, + n: int, + num_loras: int, + dtype: torch.dtype, + device: str = "cuda") -> torch.Tensor: + + # LoRA weights column major + return torch.rand((num_loras, n, k), dtype=dtype).to(device) + + +def make_rand_tensors( + a_shape: Tuple[int], + b_shape: Tuple[int], + c_shape: Tuple[int], + a_dtype: torch.dtype, + b_dtype: torch.dtype, + c_dtype: torch.dtype, + num_slices: int, + device: str = "cuda", +) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: + """ + Make LoRA input/output matrices. + """ + A = torch.rand(a_shape, dtype=a_dtype).to(device) + + # LoRA weights column major + Bs = [ + torch.rand(b_shape, dtype=b_dtype).to(device) + for _ in range(num_slices) + ] + + C = torch.zeros(c_shape, dtype=c_dtype).to(device) + return A, Bs, C + + +def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int, + sort_by_lora_id: bool, + device: str) -> torch.Tensor: + """ + All prompts are mapped to a Lora ID in range [0, num_active_loras). + where 0 refers to first lora, 1 refers to second lora and so on. + """ + assert num_active_loras > 0 + + if not sort_by_lora_id: + return torch.randint(0, + num_active_loras, (num_prompts, ), + dtype=torch.long) + + # Divide LoRAs equally and in order. + part_size = num_prompts // num_active_loras + part_size = max(part_size, 1) + + lora_id = 0 + prompt_lora_mapping = [] + while len(prompt_lora_mapping) < num_prompts: + prompt_lora_mapping.extend([lora_id] * part_size) + lora_id = lora_id + 1 if lora_id + 1 < num_active_loras else lora_id + return torch.tensor(prompt_lora_mapping[:num_prompts], + dtype=torch.long, + device=device) + + +def make_token_lora_mapping(num_tokens: int, num_prompts: int, + prompt_lora_mapping: torch.Tensor, + seq_len_tensor: torch.Tensor, device: str): + """ + Make token_lora_mapping from prompt_lora_mapping and seq_lens_tensor + """ + assert prompt_lora_mapping.shape[0] == num_prompts + + # token to lora index mapping + token_lora_mapping = [0] * num_tokens + current_offset = 0 + for b_id in range(num_prompts): + lora_index = prompt_lora_mapping[b_id].item() + s = current_offset + e = s + seq_len_tensor[b_id].item() + token_lora_mapping[s:e] = [lora_index] * (e - s) + current_offset += seq_len_tensor[b_id].item() + + return torch.tensor(token_lora_mapping, dtype=torch.long, device=device) + + +def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + seq_lens_cpu: torch.Tensor, + prompt_lora_mapping_cpu: torch.Tensor, scaling: float, + add_inputs: Optional[bool]): + """ + Torch group gemm reference implementation to test correctness of + benchmarking operations. + """ + batches = seq_lens_cpu.size(0) + out_list = [] + current_offset = 0 + for lora_index, b_length in zip(range(batches), seq_lens_cpu): + x = input[current_offset:b_length + current_offset, :] + current_offset += b_length + w = lora_weights[prompt_lora_mapping_cpu[lora_index]] + result = torch.nn.functional.linear(x, w) + result *= scaling + out_list.append(result) + torch.cat(out_list, dim=0) + + cat_result = torch.cat(out_list, dim=0) + + if add_inputs: + ref_out += cat_result + else: + ref_out.copy_(cat_result) + + +class OpType(Enum): + """ + LoRA Ops to benchmark and its properties. + """ + SGMV_SHRINK = auto() + BGMV_SHRINK = auto() + SGMV_EXPAND = auto() + BGMV_EXPAND = auto() + BGMV_EXPAND_SLICE = auto() + + @staticmethod + def from_str(s: str) -> "OpType": + if s.lower() == 'sgmv_shrink': + return OpType.SGMV_SHRINK + if s.lower() == 'sgmv_expand': + return OpType.SGMV_EXPAND + if s.lower() == 'bgmv_shrink': + return OpType.BGMV_SHRINK + if s.lower() == 'bgmv_expand': + return OpType.BGMV_EXPAND + if s.lower() == "bgmv_expand_slice": + return OpType.BGMV_EXPAND_SLICE + raise ValueError(f"Unrecognized str {s} to convert to OpType") + + def is_shrink_fn(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK] + + def is_expand_fn(self) -> bool: + return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND] + + def is_prefill_op(self) -> bool: + return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND] + + def is_decode_op(self) -> bool: + return self in [ + OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE + ] + + def is_expand_slice_fn(self) -> bool: + return self in [OpType.BGMV_EXPAND_SLICE] + + def num_slices(self) -> List[int]: + if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]: + # SGMV kernels supports slices + return [1, 2, 3] + if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]: + return [1] + if self in [OpType.BGMV_EXPAND_SLICE]: + return [2, 3] + raise ValueError(f"Unrecognized OpType {self}") + + def mkn(self, batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int) -> Tuple[int, int, int]: + num_tokens = batch_size * seq_length + if self.is_shrink_fn(): + m = num_tokens + k = hidden_size + n = lora_rank + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + m = num_tokens + k = lora_rank + n = hidden_size + return m, k, n + + def matmul_dtypes( + self, op_dtype: torch.dtype + ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]: + """ + return a type, b type and c type for A x B = C + """ + if self.is_shrink_fn(): + return op_dtype, op_dtype, torch.float32 + else: + assert self.is_expand_fn() or self.is_expand_slice_fn() + return torch.float32, op_dtype, op_dtype + + def matmul_shapes( + self, batch_size: int, seq_length: int, hidden_size: int, + lora_rank: int, num_loras: int, + num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]: + """ + Given num_slices, return the shapes of the A, B, and C matrices + in A x B = C, for the op_type + """ + m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank) + + b_shape = (num_loras, n, k) # col-major + if self == OpType.SGMV_SHRINK: + # SGMV shrink supports num_slices inherently in the kernel + return ((m, k), b_shape, (num_slices, m, n)) + if self == OpType.SGMV_EXPAND: + # SGMV expand supports num_slices inherently in the kernel + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + if self == OpType.BGMV_SHRINK: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND: + return ((m, k), b_shape, (m, n)) + if self == OpType.BGMV_EXPAND_SLICE: + return ((num_slices, m, k), b_shape, (m, n * num_slices)) + + raise ValueError(f"Unrecognized op_type {self}") + + def bench_fn(self) -> Callable: + + def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]): + for x in kwargs_list: + bgmv_expand_slice(**x) + + if self == OpType.SGMV_SHRINK: + return sgmv_shrink + if self == OpType.SGMV_EXPAND: + return sgmv_expand + if self == OpType.BGMV_SHRINK: + return bgmv_shrink + if self == OpType.BGMV_EXPAND: + return bgmv_expand + if self == OpType.BGMV_EXPAND_SLICE: + return emulate_bgmv_expand_slice + raise ValueError(f"Unrecognized optype {self}") + + def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor, + lora_weights: List[torch.Tensor], + **kwargs) -> Callable: + """Each benchmark operation expected the input, lora_weights and outputs + in a slightly different format. Refer to self.matmul_shapes(). + run_ref_group_gemm accounts for those differences in executing a + reference group gemm for correctness testing. + """ + w_dtype = lora_weights[0].dtype + num_slices = len(lora_weights) + if self == OpType.SGMV_SHRINK: + for slice_idx in range(num_slices): + ref_group_gemm(ref_out=output[slice_idx, :], + input=input, + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.SGMV_EXPAND: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + if self == OpType.BGMV_SHRINK: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input, + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND: + assert num_slices == 1 + ref_group_gemm(ref_out=output, + input=input.clone().to(dtype=w_dtype), + lora_weights=lora_weights[0], + **kwargs) + if self == OpType.BGMV_EXPAND_SLICE: + hidden_size = lora_weights[0].shape[1] + for slice_idx in range(num_slices): + slice_offset = slice_idx * hidden_size + ref_group_gemm( + ref_out=output[:, slice_offset:slice_offset + hidden_size], + input=input[slice_idx].clone().to(dtype=w_dtype), + lora_weights=lora_weights[slice_idx], + **kwargs) + raise ValueError(f"Unrecognized optype {self}") + + +@dataclass +class BenchmarkContext: + """ + LoRA benchmark context + """ + batch_size: int + hidden_size: int + num_loras: int + num_active_loras: int + lora_rank: int + sort_by_lora_id: bool + dtype: torch.dtype + seq_length: Optional[int] = None + num_slices: Optional[int] = None # num_slices for slice based ops + + def with_seq_length(self, seq_length: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.seq_length = seq_length + return ctx + + def with_num_slices(self, num_slices: int) -> "BenchmarkContext": + ctx = copy.copy(self) + ctx.num_slices = num_slices + return ctx + + def bench_label(self) -> str: + return f"lora-{self.dtype}" + + def bench_sublabel(self, op_type: OpType) -> str: + m, k, n = op_type.mkn(self.batch_size, self.seq_length, + self.hidden_size, self.lora_rank) + desc = { + 'bs': self.batch_size, + 'sl': self.seq_length, + 'm': m, + 'k': k, + 'n': n, + 'num_loras': self.num_loras, + 'sort_by_lora': self.sort_by_lora_id, + 'num_slices': self.num_slices, + } + return json.dumps(desc) + + +@dataclass +class BenchmarkTensors: + """ + Input/Output tensors used for benchmarks + """ + # matmul tensors + input: torch.Tensor + lora_weights_lst: List[torch.Tensor] + output: torch.Tensor + # metadata tensors + seq_lens: torch.Tensor + seq_start_loc: torch.Tensor + prompt_lora_mapping: torch.Tensor + token_lora_mapping: torch.Tensor + + def io_types(self) -> str: + return (f"{dtype_to_str(self.input.dtype)}x" + f"{dtype_to_str(self.lora_weights_lst[0].dtype)}=>" + f"{dtype_to_str(self.output.dtype)}") + + @staticmethod + def make(ctx: BenchmarkContext, + op_type: OpType, + device: str = "cuda") -> "BenchmarkTensors": + + # Make input / output matmul tensors. + a_shape, b_shape, c_shape = op_type.matmul_shapes( + ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank, + ctx.num_loras, ctx.num_slices) + a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype) + input_tensor, lora_weights, output_tensor = \ + make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type, + num_slices = ctx.num_slices) + + # Make metadata tensors. + # Keep the metadata tensors in the CPU for further processing if needed. + # The tensors get moved to the GPU before benchmarking. + assert ctx.num_active_loras <= ctx.num_loras + total_tokens = ctx.batch_size * ctx.seq_length + + # Prepare seq lens tensor + seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1, + (ctx.batch_size, )) + # Prepare seq_start_loc tensor + seq_start_loc_tensor = torch.cumsum(torch.tensor( + [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long), + dim=0) + assert total_tokens == seq_len_tensor.sum() + # Prepare prompt lora indices tensor + prompt_lora_indices_tensor = make_prompt_lora_mapping( + ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu") + # Prepare token lora indices tensor + token_lora_indices_tensor = make_token_lora_mapping( + total_tokens, ctx.batch_size, prompt_lora_indices_tensor, + seq_len_tensor, "cpu") + + return BenchmarkTensors(input_tensor, lora_weights, output_tensor, + seq_len_tensor, seq_start_loc_tensor, + prompt_lora_indices_tensor, + token_lora_indices_tensor) + + def sanity_check(self) -> None: + """ + Fails asserts when non-conformality is detected. + """ + num_tokens = self.input.shape[-2] + # check metadata tensors + assert torch.sum(self.seq_lens) == num_tokens + num_seqs = self.seq_lens.shape[0] + assert self.seq_start_loc.shape[0] == num_seqs + assert self.prompt_lora_mapping.shape[0] == num_seqs + assert self.token_lora_mapping.shape[0] == num_tokens + + def to_device(self, device: str): + """ + Transfer tensors to device if the tensors aren't already on the device + """ + + def to_device(tensor: torch.Tensor): + if tensor.device != device: + tensor = tensor.to(device=device) + return tensor + + self.input = to_device(self.input) + self.output = to_device(self.output) + self.seq_lens = to_device(self.seq_lens) + self.seq_start_loc = to_device(self.seq_start_loc) + self.prompt_lora_mapping = to_device(self.prompt_lora_mapping) + self.token_lora_mapping = to_device(self.token_lora_mapping) + for i in range(len(self.lora_weights_lst)): + self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i]) + + def metadata(self) -> Tuple[int, int, int]: + """ + Return num_seqs, num_tokens and max_seq_len + """ + num_seqs = self.seq_lens.shape[0] + num_tokens = self.token_lora_mapping.shape[0] + max_seq_len = torch.max(self.seq_lens).item() + num_slices = len(self.lora_weights_lst) + return num_seqs, num_tokens, max_seq_len, num_slices + + def convert_to_sgmv_benchmark_tensors(self): + """ + For sgmv punica kernels, when consecutive sequences have the + same LoRA ID, we just merge them together. + This happens in punica.py::compute_metadata + """ + + # Collapse seq_lens and seq_start_loc + _, seq_lens = torch.unique_consecutive(self.token_lora_mapping, + return_counts=True) + cum_result = torch.cumsum(seq_lens, dim=0) + seq_start_loc = torch.zeros_like(seq_lens) + seq_start_loc[1:].copy_(cum_result[:-1]) + + # Collapse prompt mapping + prompt_lora_mapping = torch.unique_consecutive( + self.prompt_lora_mapping) + + assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \ + f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}" + + self.prompt_lora_mapping = prompt_lora_mapping.to( + dtype=self.prompt_lora_mapping.dtype) + self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype) + self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype) + + def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]: + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_slices, num_tokens, lora_rank] + assert len(o_shape) == 3 + assert o_shape == (num_slices, num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'scaling': 1.0, + } + + def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + self.convert_to_sgmv_benchmark_tensors() + self.sanity_check() + self.to_device(self.input.device) + + num_seqs, num_tokens, max_seq_len, num_slices = self.metadata() + + # Sanity check matrix shapes. + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape : [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape : [num_lora, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape : [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst, + 'output_tensor': self.output, + 'b_seq_start_loc': self.seq_start_loc, + 'seq_len_tensor': self.seq_lens, + 'lora_indices_tensor': self.prompt_lora_mapping, + 'batches': num_seqs, + 'max_seq_length': max_seq_len, + 'token_nums': num_tokens, + 'offset_start': 0, + 'add_inputs': add_inputs, + } + + def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]: + assert len(self.lora_weights_lst) == 1 + self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, hidden_size] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + hidden_size = i_shape[1] + # Expected lora weight shape [num_loras, lora_rank, hidden_size] + assert len(lw_shape) == 3 + assert lw_shape[2] == hidden_size + lora_rank = lw_shape[1] + # Expected output shape [num_tokens, lora_rank] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, lora_rank) + + return { + 'inputs': self.input, + 'lora_a_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'scaling': 1.0 + } + + def as_bgmv_expand_kwargs(self, add_inputs: bool): + assert len(self.lora_weights_lst) == 1 + self.to_device(self.input.device) + + _, num_tokens, _, _ = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_tokens, lora_rank] + assert len(i_shape) == 2 + assert i_shape[0] == num_tokens + lora_rank = i_shape[1] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size) + + return { + 'inputs': self.input, + 'lora_b_weights': self.lora_weights_lst[0], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'add_inputs': add_inputs + } + + def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]: + + _, num_tokens, _, num_slices = self.metadata() + # Sanity check shapes + i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[ + 0].shape, self.output.shape + # Expected input shape [num_slices, num_tokens, lora_rank] + assert len(i_shape) == 3 + assert i_shape[0] == num_slices + assert i_shape[1] == num_tokens + lora_rank = i_shape[2] + # Expected lora weight shape [num_loras, hidden_size, lora_rank] + assert len(lw_shape) == 3 + assert lw_shape[2] == lora_rank + hidden_size = lw_shape[1] + # Expected output shape [num_tokens, hidden_size * num_slices] + assert len(o_shape) == 2 + assert o_shape == (num_tokens, hidden_size * num_slices) + + self.to_device(self.input.device) + + kwargs_list = [] + for i in range(num_slices): + kwargs_list.append({ + 'inputs': self.input[i], + 'lora_b_weights': self.lora_weights_lst[i], + 'output_tensor': self.output, + 'lora_indices_tensor': self.token_lora_mapping, + 'slice_offset': i * hidden_size, + 'slice_size': hidden_size, + 'add_inputs': add_inputs, + }) + return {'kwargs_list': kwargs_list} + + def bench_fn_kwargs(self, + op_type: OpType, + add_inputs: Optional[bool] = None) -> Dict[str, Any]: + if op_type.is_shrink_fn(): + assert add_inputs is None + else: + assert add_inputs is not None + + if op_type == OpType.SGMV_SHRINK: + return self.as_sgmv_shrink_kwargs() + if op_type == OpType.SGMV_EXPAND: + return self.as_sgmv_expand_kwargs(add_inputs) + if op_type == OpType.BGMV_SHRINK: + return self.as_bgmv_shrink_kwargs() + if op_type == OpType.BGMV_EXPAND: + return self.as_bgmv_expand_kwargs(add_inputs) + if op_type == OpType.BGMV_EXPAND_SLICE: + return self.as_bgmv_expand_slice_kwargs(add_inputs) + raise ValueError(f"Unrecognized optype {self}") + + def test_correctness(self, op_type: OpType, + expand_fn_add_inputs: Optional[bool]) -> bool: + """ + Test correctness of op_type implementation against a grouped gemm + reference implementation. + """ + seq_lens_cpu = self.seq_lens.to(device="cpu") + prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu") + ref_output = self.output.clone() + + self.output.zero_() + op_type.bench_fn()( + **self.bench_fn_kwargs(op_type, expand_fn_add_inputs)) + + op_type.run_ref_group_gemm( + ref_output, + self.input, + self.lora_weights_lst, + seq_lens_cpu=seq_lens_cpu, + prompt_lora_mapping_cpu=prompt_lora_mapping_cpu, + scaling=1.0, + add_inputs=expand_fn_add_inputs) + + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[self.output.dtype] + + return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol) + + +def bench_optype(ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None, + expand_fn_add_inputs: Optional[bool] = None, + test_correctness: bool = False) -> TMeasurement: + + assert arg_pool_size >= 1 + if op_type.is_shrink_fn(): + assert expand_fn_add_inputs is None + else: + assert expand_fn_add_inputs is not None + + # BenchmarkContext -> BenchmarkTensors + bench_tensors : List[BenchmarkTensors] = \ + [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)] + for bt in bench_tensors: + bt.sanity_check() + + # Test correctness of our implementation. + if test_correctness: + assert all([ + bt.test_correctness(op_type, expand_fn_add_inputs) + for bt in bench_tensors + ]) + + # BenchmarkTensors -> Dict (kwargs) + kwargs_list = [ + bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs) + for bt in bench_tensors + ] + + # Clear LoRA optimization hash-maps. + _LORA_A_PTR_DICT.clear() + _LORA_B_PTR_DICT.clear() + # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup + for kwargs in kwargs_list: + op_type.bench_fn()(**kwargs) + torch.cuda.synchronize() + + # Merge into a single kwargs and qualify arguments as ArgPool + kwargs = {k: ArgPool([]) for k in kwargs_list[0]} + for _kwargs in kwargs_list: + for k, v in _kwargs.items(): + kwargs[k].values.append(v) + + describe_args = (f"add_inputs={expand_fn_add_inputs}" + if expand_fn_add_inputs is not None else "") + description = ( + f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})") + + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + timer = None + with Bench(cuda_graph_params, + ctx.bench_label(), ctx.bench_sublabel(op_type), description, + op_type.bench_fn(), **kwargs) as bench: + timer = bench.run() + return timer + + +def bench_torch_mm(ctx: BenchmarkContext, + arg_pool_size: int, + op_type: OpType, + cuda_graph_nops: Optional[int] = None) -> TMeasurement: + """ + Benchmark basic torch.mm as a roofline. + + When all the input tokens have the same LoRA ID, the LoRA kernels are just + a matmul. This torch.mm benchmark serves as a roofline for that case. + + input op_type is used in determining the m, k, n dimensions for the matmul. + """ + + batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size, + ctx.hidden_size, + ctx.lora_rank, + ctx.seq_length, + ctx.dtype) + + m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank) + # For a fairer comparison. + n = n * ctx.num_slices + + # Get matmul input and output tensors for A x B = C + As, Bs, Cs = [], [], [] + for _ in range(arg_pool_size): + As.append(torch.rand((m, k), dtype=dtype).to("cuda")) + Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t()) + Cs.append(torch.rand((m, n), dtype=dtype).to("cuda")) + + # Make torch.mm kwargs + mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)} + + description = ( + f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}" + f"x{dtype_to_str(dtype)}" + f"=>{dtype_to_str(dtype)})") + cuda_graph_params = None + if cuda_graph_nops: + cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops) + with Bench(cuda_graph_params, ctx.bench_label(), + ctx.bench_sublabel(op_type), description, torch.mm, + **mm_kwargs) as bench: + return bench.run() + + +# runner +def use_cuda_graph_recommendation() -> str: + return """ + Triton kernels have a significant launch overhead with + launched directly via python. This overhead is more noticeable + for small the problem sizes. For these cases, it is recommended + to use the script with `--cuda-graph-nops N` to benchmark N + consecutive invocations of the benchmarking operations from + inside a CUDA Graph. Note that the returned measurement is for N + invocations of the operation. + """ + + +def print_timers(timers: List[TMeasurement], + args: Optional[argparse.Namespace] = None): + compare = TBenchmark.Compare(timers) + compare.print() + + if args and args.cuda_graph_nops: + print( + f"Note : The timings reported above is for {args.cuda_graph_nops} " + "consecutive invocations of the benchmarking functions. " + f"Please divide by {args.cuda_graph_nops} for single invocation " + "timings.") + + print("Note on Comparison with torch.mm : The torch.mm numbers are " + "benchmark numbers of a simple matmul emulating the single lora " + "case. It is provided as a roofline for comparing our LoRA Kernel " + "implementations. It is expected that the LoRA kernels will be " + "slower than torch.mm in cases where num_loras is big. But for " + "small num_loras the goal should be to match the torch.mm numbers.") + + +def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]): + + if args.cuda_graph_nops is not None: + assert args.cuda_graph_nops > 0 + print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA " + "Graph") + else: + print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}") + + timers = [] + for bench_ctx in bench_ctxs: + for seq_len in args.seq_lengths: + bench_ops: List[OpType] = [] + if seq_len == 1: + # bench all decode ops + bench_ops = [op for op in args.op_types if op.is_decode_op()] + else: + # bench all prefill ops + bench_ops = [op for op in args.op_types if op.is_prefill_op()] + + seq_len_timers = [] + for bench_op in bench_ops: + for num_slices in bench_op.num_slices(): + _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices( + num_slices) + # Benchmark torch.mm as a roofline + seq_len_timers.append( + bench_torch_mm(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops)) + + # Benchmark bench_op + expand_fn_add_inputs = [ + None + ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs + for add_input_arg in expand_fn_add_inputs: + seq_len_timers.append( + bench_optype(_ctx, args.arg_pool_size, bench_op, + args.cuda_graph_nops, add_input_arg, + args.test_correctness)) + + print_timers(seq_len_timers) + timers.extend(seq_len_timers) + + # Result stdout dump + print("== All Results ====") + print_timers(timers, args) + + if args.output_directory: + # Result file dump + od = Path(args.output_directory) + if not od.exists(): + od.mkdir() + + timestamp = int(time.time()) + pkl_file = od / f"lora_bench-{timestamp}.pkl" + print(f"Writing benchmarks to {pkl_file}") + with open(pkl_file, "wb") as f: + pickle.dump(timers, f) + + +def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int], + args: argparse.Namespace) -> List[BenchmarkContext]: + + ctxs: List[BenchmarkContext] = [] + for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa + args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras, + args.sort_by_lora_id): + ctxs.append( + BenchmarkContext( + batch_size=batch_size, + hidden_size=hidden_size, + lora_rank=lora_rank, + num_loras=num_loras, + num_active_loras=args.num_active_loras + if args.num_active_loras else num_loras, + # To be filled based on the OpType to benchmark + seq_length=None, + sort_by_lora_id=sort_by_lora_id, + dtype=args.dtype, + # To be filled based on the OpType to benchmark + num_slices=None)) + + return ctxs + + +def run_list_bench(args: argparse.Namespace): + print(args) + + print("List bench :\n" + f" Hidden Sizes {args.hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args) + + run(args, bench_contexts) + + +def run_range_bench(args: argparse.Namespace): + print(args) + + hidden_sizes = list( + range(args.hidden_sizes_start, args.hidden_sizes_end + 1, + args.hidden_sizes_increment)) + lora_ranks = list( + range(args.lora_ranks_start, args.lora_ranks_end + 1, + args.lora_ranks_increment)) + + print("Range bench :\n" + f" Hidden Sizes {hidden_sizes}" + f" LoRA Ranks {lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args) + + run(args, bench_contexts) + + +def run_model_bench(args: argparse.Namespace): + print(args) + + def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]: + hidden_sizes = set() + for KN, tp_split_dim in WEIGHT_SHAPES[model]: + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + hidden_sizes.add(KN[1]) + return hidden_sizes + + # Get all hidden sizes + hidden_sizes: set[int] = set() + for model_name, tp_size in product(args.models, args.tp_sizes): + hidden_sizes = hidden_sizes.union( + hidden_sizes_from_model(model_name, tp_size)) + + print("Model bench :\n" + f" Hidden Sizes {hidden_sizes}" + f" LoRA Ranks {args.lora_ranks}") + + # Get all benchmarking contexts + bench_contexts: List[BenchmarkContext] = as_benchmark_contexts( + hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args) + + run(args, bench_contexts) + + +if __name__ == '__main__': + + def to_torch_dtype(dt): + if dt == "torch.float16": + return torch.float16 + if dt == "torch.bfloat16": + return torch.bfloat16 + raise ValueError("unsupported dtype") + + def get_bool(s: str) -> bool: + return s.lower() in ['true', '1'] + + def add_common_command_args(p: argparse.ArgumentParser): + p.add_argument( + "--dtype", + type=to_torch_dtype, + required=True, + help="Available options are ['torch.float16', 'torch.bfloat16']") + + p.add_argument( + "--arg-pool-size", + type=int, + default=32, + help="Run profiles with a pool of input/output/meta tensors instead" + "of simply reusing the same tensors for all runs. A bigger arg-pool" + "mitigates hardware caching effects during benchmarking.") + + p.add_argument( + "--cuda-graph-nops", + type=int, + help=("when set profiling is done using cudagraph, " + "with the given number of operations in a graph." + "Note that the measurement returned is the time " + "taken for N consecutive executions of the benchmarking " + "functions, where N is the value of this argument.")) + p.add_argument("--num-loras", + nargs="+", + type=int, + default=DEFAULT_NUM_LORAS) + p.add_argument("--num-active-loras", + type=int, + default=None, + help="Active LoRAs. When None, all LoRAs are active") + p.add_argument("--sort-by-lora-id", + nargs="+", + type=get_bool, + default=DEFAULT_SORT_BY_LORA_IDS) + p.add_argument("--op-types", + nargs="+", + type=OpType.from_str, + default=list(OpType)) + p.add_argument('--seq-lengths', + nargs="+", + type=int, + default=DEFAULT_SEQ_LENGTHS) + p.add_argument("--batch-sizes", + nargs="+", + type=int, + default=DEFAULT_BATCH_SIZES) + p.add_argument("--expand-fn-add-inputs", + nargs="+", + type=get_bool, + default=DEFAULT_EXPAND_FN_ADD_INPUTS) + p.add_argument( + '-o', + '--output-directory', + type=str, + help=("Output directory to store a the list of benchmarking" + "TMeasurement objects as a pickle file")) + + p.add_argument( + "--test-correctness", + action='store_true', + help=("When enabled, the benchmarking functions are tested" + "for correctness before the actual benchmarking")) + + parser = FlexibleArgumentParser( + description=f""" +Benchmark LoRA kernels: + {use_cuda_graph_recommendation()} + + list_bench example: + python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + model_bench example: + python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 + + range_bench example: + python3 benchmarks/kernels/benchmark_lora.py range_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 + """, # noqa: E501 + formatter_class=argparse.RawTextHelpFormatter) + + subparsers = parser.add_subparsers(dest="cmd", required=True) + + list_parser = subparsers.add_parser("list_bench") + list_parser.add_argument("--hidden-sizes", + nargs="+", + type=int, + default=DEFAULT_HIDDEN_SIZES) + list_parser.add_argument("--lora-ranks", + nargs="+", + type=int, + default=DEFAULT_LORA_RANKS) + add_common_command_args(list_parser) + list_parser.set_defaults(func=run_list_bench) + + range_parser = subparsers.add_parser("range_bench") + range_parser.add_argument("--hidden-sizes-start", type=int, required=True) + range_parser.add_argument("--hidden-sizes-end", type=int, required=True) + range_parser.add_argument("--hidden-sizes-increment", + type=int, + required=True) + range_parser.add_argument("--lora-ranks-start", type=int, required=True) + range_parser.add_argument("--lora-ranks-end", type=int, required=True) + range_parser.add_argument("--lora-ranks-increment", + type=int, + required=True) + add_common_command_args(range_parser) + range_parser.set_defaults(func=run_range_bench) + + model_parser = subparsers.add_parser("model_bench") + model_parser.add_argument("--models", + nargs="+", + type=str, + default=DEFAULT_MODELS, + choices=WEIGHT_SHAPES.keys()) + model_parser.add_argument("--tp-sizes", + nargs="+", + type=int, + default=DEFAULT_TP_SIZES) + model_parser.add_argument("--lora-ranks", + nargs="+", + type=int, + default=DEFAULT_LORA_RANKS) + add_common_command_args(model_parser) + model_parser.set_defaults(func=run_model_bench) + + args = parser.parse_args() + args.func(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 8f538c21f7f7e..1d59a01422412 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,6 +1,7 @@ import argparse import time from datetime import datetime +from itertools import product from typing import Any, Dict, List, Tuple, TypedDict import ray @@ -11,7 +12,10 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, is_navi + +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm( +) and not is_navi() else torch.float8_e4m3fn class BenchmarkConfig(TypedDict): @@ -80,8 +84,8 @@ def benchmark_config( a1_scale = torch.randn(1, dtype=torch.float32) a2_scale = torch.randn(1, dtype=torch.float32) - w1 = w1.to(torch.float8_e4m3fn) - w2 = w2.to(torch.float8_e4m3fn) + w1 = w1.to(FP8_DTYPE) + w2 = w2.to(FP8_DTYPE) input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32) @@ -141,28 +145,172 @@ def run(): return avg -def get_configs_compute_bound() -> List[Dict[str, int]]: - # Reduced search space for faster tuning. - # TODO(woosuk): Increase the search space and use a performance model to - # prune the search space. +def get_rocm_tuning_space(use_fp16): + block_mn_range = [16, 32, 64, 128, 256] + block_k_range = [16, 32, 64, 128, 256] + if not use_fp16: + block_k_range.remove(16) # BLOCK_K=16 not supported for fp8 + num_warps_range = [1, 2, 4, 8] + group_m_range = [1, 4, 8, 16, 32] + num_stage_range = [2] + waves_per_eu_range = [0] + matrix_instr_nonkdim_range = [16, 32] if use_fp16 else [] + kpack_range = [1, 2] if use_fp16 else [] + + param_ranges = { + "BLOCK_SIZE_M": block_mn_range, + "BLOCK_SIZE_N": block_mn_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + "waves_per_eu": waves_per_eu_range, + } + if use_fp16: + param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range + param_ranges["kpack"] = kpack_range + + return param_ranges + + +def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: configs: List[BenchmarkConfig] = [] - for num_stages in [2, 3, 4, 5]: - for block_m in [16, 32, 64, 128, 256]: - for block_k in [64, 128, 256]: - for block_n in [32, 64, 128, 256]: - for num_warps in [4, 8]: - for group_size in [1, 16, 32, 64]: - configs.append({ - "BLOCK_SIZE_M": block_m, - "BLOCK_SIZE_N": block_n, - "BLOCK_SIZE_K": block_k, - "GROUP_SIZE_M": group_size, - "num_warps": num_warps, - "num_stages": num_stages, - }) + + if current_platform.is_rocm(): + param_ranges = get_rocm_tuning_space(use_fp16) + else: + # Reduced search space for faster tuning. + # TODO(woosuk): Increase the search space and use a performance model to + # prune the search space. + block_m_range = [16, 32, 64, 128, 256] + block_n_range = [32, 64, 128, 256] + block_k_range = [64, 128, 256] + num_warps_range = [4, 8] + group_m_range = [1, 16, 32, 64] + num_stage_range = [2, 3, 4, 5] + + param_ranges = { + "BLOCK_SIZE_M": block_m_range, + "BLOCK_SIZE_N": block_n_range, + "BLOCK_SIZE_K": block_k_range, + "GROUP_SIZE_M": group_m_range, + "num_warps": num_warps_range, + "num_stages": num_stage_range, + } + + keys, values = zip(*param_ranges.items()) + for config_values in product(*values): + config = dict(zip(keys, config_values)) + configs.append(config) return configs +def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, + search_space, is_fp16): + N1, K1 = shard_intermediate_size, hidden_size + N2, K2 = hidden_size, shard_intermediate_size // 2 + pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, + is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, + is_fp16) + search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) + return search_space + + +# The following code is inspired by ROCm/Triton GEMM tuning script: +# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89 +def prune_rocm_configs(M, N, K, configs, is_fp16=True): + pruned_configs = [] + elemBytes_a = 2 if is_fp16 else 1 + elemBytes_b = 2 if is_fp16 else 1 + + mfma = 16 if M < 32 or N < 32 else 32 + + # TODO (zhanglx): figure out the boundary between large and small gemms + large_gemm = False + if M >= 2048 and N >= 2048: + large_gemm = True + + for config in configs: + BLOCK_SIZE_M = config.get("BLOCK_SIZE_M") + BLOCK_SIZE_N = config.get("BLOCK_SIZE_N") + BLOCK_SIZE_K = config.get("BLOCK_SIZE_K") + num_warps = config.get("num_warps") + + if is_fp16: + matrix_instr_nonkdim = config.get("matrix_instr_nonkdim") + if matrix_instr_nonkdim > mfma: + continue + if mfma == 4 and BLOCK_SIZE_K < 64: + continue + # some layouts could not work properly in case + # number elements per thread is less 1 + if BLOCK_SIZE_M * BLOCK_SIZE_N < 64: + continue + SPLIT_K = config.get("SPLIT_K", 1) + GROUP_M = config.get("GROUP_SIZE_M") + if is_fp16: + if (matrix_instr_nonkdim > BLOCK_SIZE_M + or matrix_instr_nonkdim > BLOCK_SIZE_N): + continue + if (matrix_instr_nonkdim >= M + and matrix_instr_nonkdim != BLOCK_SIZE_M): + continue + if (matrix_instr_nonkdim >= N + and matrix_instr_nonkdim != BLOCK_SIZE_N): + continue + # Skip BLOCK_SIZE that is too large compare to M/N + # unless BLOCK_SIZE is already small enough + if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16: + continue + if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16: + continue + # skip large split_k when not necessary + if SPLIT_K != 1 and not need_split_k(M, N, K): + continue + # skip split_k that leads to EVEN_K = false + leap = SPLIT_K * BLOCK_SIZE_K + modv = K % leap + if modv != 0: + continue + # skip large GROUP_M + if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1: + continue + # out of shared memory resource + # TODO (zhanglx): This does not consider the LDS usage in the epilogue + LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a + + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b) + if LDS > 65536: + continue + # Skip small block sizes and num_warps for large gemm + # For fp16 and f8, we want to only use BLOCK_SIZE >= 64 + if large_gemm: + if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64: + continue + if BLOCK_SIZE_K < 64: + continue + if num_warps < 4: + continue + + pruned_configs.append(config) + + return pruned_configs + + +def need_split_k(SIZE_M, SIZE_N, SIZE_K): + return (SIZE_M < 64 or SIZE_N < 64) and SIZE_K > 1024 + + +def merge_unique_dicts(list1, list2): + result = [] + combined_list = list1.copy() + combined_list.extend(list2) + for dictionary in combined_list: + if dictionary not in result: + result.append(dictionary) + return result + + @ray.remote(num_gpus=1) class BenchmarkWorker: @@ -170,6 +318,10 @@ def __init__(self, seed: int) -> None: torch.set_default_device("cuda") current_platform.seed_everything(seed) self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. + self.device_id = int(ray.get_gpu_ids()[0]) def benchmark( self, @@ -217,25 +369,33 @@ def tune( ) -> Dict[str, int]: best_config = None best_time = float("inf") - for config in tqdm(search_space): - try: - kernel_time = benchmark_config(config, - num_tokens, - num_experts, - shard_intermediate_size, - hidden_size, - topk, - dtype, - use_fp8_w8a8, - use_int8_w8a16, - num_iters=10) - except triton.runtime.autotuner.OutOfResources: - # Some configurations may be invalid and fail to compile. - continue - - if kernel_time < best_time: - best_time = kernel_time - best_config = config + if current_platform.is_rocm(): + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = prune_rocm_search_space(num_tokens, + shard_intermediate_size, + hidden_size, search_space, + is_fp16) + + with torch.cuda.device(self.device_id): + for config in tqdm(search_space): + try: + kernel_time = benchmark_config(config, + num_tokens, + num_experts, + shard_intermediate_size, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=20) + except triton.runtime.autotuner.OutOfResources: + # Some configurations may be invalid and fail to compile. + continue + + if kernel_time < best_time: + best_time = kernel_time + best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") assert best_config is not None @@ -244,12 +404,27 @@ def tune( def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { - "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], - "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], - "BLOCK_SIZE_K": config["BLOCK_SIZE_K"], - "GROUP_SIZE_M": config["GROUP_SIZE_M"], - "num_warps": config["num_warps"], - "num_stages": config["num_stages"], + "BLOCK_SIZE_M": + config["BLOCK_SIZE_M"], + "BLOCK_SIZE_N": + config["BLOCK_SIZE_N"], + "BLOCK_SIZE_K": + config["BLOCK_SIZE_K"], + "GROUP_SIZE_M": + config["GROUP_SIZE_M"], + "num_warps": + config["num_warps"], + "num_stages": + config["num_stages"], + **({ + "waves_per_eu": config["waves_per_eu"] + } if "waves_per_eu" in config else {}), + **({ + "matrix_instr_nonkdim": config["matrix_instr_nonkdim"] + } if "matrix_instr_nonkdim" in config else {}), + **({ + "kpack": config["kpack"] + } if "kpack" in config else {}), } @@ -294,7 +469,7 @@ def main(args: argparse.Namespace): shard_intermediate_size = 2 * intermediate_size // args.tp_size hidden_size = config.hidden_size - dtype = config.torch_dtype + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_int8_w8a16 = args.dtype == "int8_w8a16" @@ -322,7 +497,8 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: return ray.get(outputs) if args.tune: - search_space = get_configs_compute_bound() + is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16) + search_space = get_configs_compute_bound(is_fp16) print(f"Start tuning over {len(search_space)} configurations...") start = time.time() diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py new file mode 100644 index 0000000000000..fee877b6f76fa --- /dev/null +++ b/benchmarks/kernels/utils.py @@ -0,0 +1,210 @@ +import dataclasses +from typing import Any, Callable, Iterable, Optional + +import torch +import torch.utils.benchmark as TBenchmark +from torch.utils.benchmark import Measurement as TMeasurement + + +@dataclasses.dataclass +class CudaGraphBenchParams: + num_ops_in_cuda_graph: int + + +@dataclasses.dataclass +class ArgPool: + """ + When some argument of the benchmarking function is annotated with this type, + the benchmarking class (BenchMM) will collapse the argument to a pick a + single value from the given list of values, during function invocation. + For every invocation during a benchmarking run, it will choose a + different value from the list. + """ + values: Iterable[Any] + + def __getitem__(self, index): + return self.values[index] + + +class Bench: + + class ArgsIterator: + + def __init__(self, args_list, kwargs_list): + assert len(args_list) == len(kwargs_list) + self.args_list = args_list + self.kwargs_list = kwargs_list + self.n = len(self.args_list) + self.idx = 0 + + def __next__(self): + while True: + yield (self.args_list[self.idx], self.kwargs_list[self.idx]) + self.idx += 1 + self.idx = self.idx % self.n + + def reset(self): + self.idx = 0 + + @property + def n_args(self): + return self.n + + def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams], + label: str, sub_label: str, description: str, fn: Callable, + *args, **kwargs): + + self.cuda_graph_params = cuda_graph_params + self.use_cuda_graph = self.cuda_graph_params is not None + self.label = label + self.sub_label = sub_label + self.description = description + self.fn = fn + + # Process args + self._args = args + self._kwargs = kwargs + self.args_list, self.kwargs_list = self.collapse_argpool( + *args, **kwargs) + self.args_iterator = self.ArgsIterator(self.args_list, + self.kwargs_list) + + # Cudagraph runner + self.g = None + if self.use_cuda_graph: + self.g = self.get_cuda_graph_runner() + + # benchmark run params + self.min_run_time = 1 + + def collapse_argpool(self, *args, **kwargs): + argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [ + arg for arg in kwargs.values() if isinstance(arg, ArgPool) + ] + if len(argpool_args) == 0: + return [args], [kwargs] + + # Make sure all argpools are of the same size + argpool_size = len(argpool_args[0].values) + assert all([argpool_size == len(arg.values) for arg in argpool_args]) + + # create copies of the args + args_list = [] + kwargs_list = [] + for _ in range(argpool_size): + args_list.append(args) + kwargs_list.append(kwargs.copy()) + + for i in range(argpool_size): + # collapse args; Just pick the ith value + args_list[i] = tuple([ + arg[i] if isinstance(arg, ArgPool) else arg + for arg in args_list[i] + ]) + + # collapse kwargs + kwargs_i = kwargs_list[i] + arg_pool_keys = [ + k for k, v in kwargs_i.items() if isinstance(v, ArgPool) + ] + for k in arg_pool_keys: + # again just pick the ith value + kwargs_i[k] = kwargs_i[k][i] + kwargs_list[i] = kwargs_i + + return args_list, kwargs_list + + def get_cuda_graph_runner(self): + assert self.use_cuda_graph + assert self.args_iterator is not None + + num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph + + # warmup + args_it = self.args_iterator.__next__() + for _ in range(2): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + for _ in range(num_graph_ops): + args, kwargs = next(args_it) + self.fn(*args, **kwargs) + return g + + def run_cudagrah(self) -> TMeasurement: + assert self.use_cuda_graph + globals = {'g': self.g} + + return TBenchmark.Timer( + stmt="g.replay()", + globals=globals, + label=( + f"{self.label}" + f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops" + ), + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run_eager(self) -> TMeasurement: + setup = None + stmt = None + globals = None + + has_arg_pool = self.args_iterator.n_args > 1 + if has_arg_pool: + setup = ''' + args_iterator.reset() + args_it = args_iterator.__next__() + ''' + stmt = ''' + args, kwargs = next(args_it) + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args_iterator': self.args_iterator} + else: + # no arg pool. Just use the args and kwargs directly + self.args_iterator.reset() + args_it = self.args_iterator.__next__() + args, kwargs = next(args_it) + + setup = "" + stmt = ''' + fn(*args, **kwargs) + ''' + globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs} + + return TBenchmark.Timer( + stmt=stmt, + setup=setup, + globals=globals, + label=self.label, + sub_label=self.sub_label, + description=self.description, + ).blocked_autorange(min_run_time=self.min_run_time) + + def run(self) -> TMeasurement: + timer = None + if self.use_cuda_graph: # noqa SIM108 + timer = self.run_cudagrah() + else: + timer = self.run_eager() + if not timer.meets_confidence() or timer.has_warnings: + print("Doesn't meet confidence - re-running bench ...") + return self.run() + return timer + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_type: + print(f"exc type {exc_type}") + print(f"exc value {exc_value}") + print(f"exc traceback {traceback}") diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index 408e736d5bc0f..c2ae554c9f8e8 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -32,7 +32,7 @@ class ScalarType { signed_(signed_), bias(bias), finite_values_only(finite_values_only), - nan_repr(nan_repr){}; + nan_repr(nan_repr) {}; static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits - 1, true, bias); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 28db0479748bf..a71815106133a 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,13 +2,13 @@ #define CPU_TYPES_HPP #if defined(__x86_64__) - //x86 implementation + // x86 implementation #include "cpu_types_x86.hpp" #elif defined(__POWER9_VECTOR__) - //ppc implementation + // ppc implementation #include "cpu_types_vsx.hpp" #elif defined(__aarch64__) - //arm implementation + // arm implementation #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index ae062a5b86892..990e99f2fc069 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -1,48 +1,50 @@ #include -#include +#include #include namespace vec_op { #ifdef ARM_BF16_SUPPORT - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) #else - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #endif -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { - template - constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); - }; -}; +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +}; +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; }; @@ -54,127 +56,124 @@ struct FP16Vec8 : public Vec { float16x8_t reg; - explicit FP16Vec8(const void *ptr) - : reg(vld1q_f16(static_cast(ptr))) {}; + explicit FP16Vec8(const void* ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { - vst1q_f16(static_cast<__fp16 *>(ptr), reg); - } + void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); } }; struct FP16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - float16x8x2_t reg; - - explicit FP16Vec16(const void *ptr) { - reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); - reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); - } - - explicit FP16Vec16(const FP32Vec16& vec); - - void save(void *ptr) const { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void* ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } } - - void save(void *ptr, const int elem_num) const { - int full_blocks = elem_num / 8; - int remainder = elem_num % 8; - - if (full_blocks > 0) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - if (full_blocks > 1) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - } - - // Note: below is the unrolled version of the following code: - // - // for (int i = 0; i < remainder; ++i) { - // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = - // vgetq_lane_f16(temp, i); - // } - // - // For macOS build (Clang), the arm/neon intrinsics function - // `vgetq_lane_f16` needs the parameter `i` to be constant at compile - // time. - - if (remainder > 0) { - float16x8_t temp = reg.val[full_blocks]; - __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); - switch (remainder) - { - case 1: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - break; - case 2: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - break; - case 3: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - break; - case 4: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - break; - case 5: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - break; - case 6: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - break; - case 7: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); - break; - - default: - break; - } - } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. + + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; + } } + } }; - #ifdef ARM_BF16_SUPPORT struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; bfloat16x8_t reg; - explicit BF16Vec8(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + explicit BF16Vec8(float32x4x2_t v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -182,19 +181,18 @@ struct BF16Vec16 : public Vec { bfloat16x8x2_t reg; - explicit BF16Vec16(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - explicit BF16Vec16(float32x4x4_t v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) - }){}; + explicit BF16Vec16(float32x4x4_t v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; }; struct BF16Vec32 : public Vec { @@ -202,19 +200,15 @@ struct BF16Vec32 : public Vec { bfloat16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {}; + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; }; #endif @@ -232,11 +226,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; - explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {}; explicit FP32Vec4(float32x4_t data) : reg(data) {}; - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; }; struct FP32Vec8 : public Vec { @@ -252,32 +246,37 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; - explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + explicit FP32Vec8(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; - explicit FP32Vec8(const FP16Vec8 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); - }; + explicit FP32Vec8(const FP16Vec8& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; - explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + explicit FP32Vec8(float16x8_t v) + : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; - #ifdef ARM_BF16_SUPPORT +#ifdef ARM_BF16_SUPPORT - explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + explicit FP32Vec8(bfloat16x8_t v) + : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; - explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + explicit FP32Vec8(const BF16Vec8& v) + : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; - #endif +#endif float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; } @@ -324,10 +323,14 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; - float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; - float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; - float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; - float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), + static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), + static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), + static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), + static_cast(erf(ar.values[7]))}; float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); @@ -337,25 +340,29 @@ struct FP32Vec8 : public Vec { result.val[1] = result1; return FP32Vec8(result); - } + } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1])})); } - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); } @@ -370,103 +377,100 @@ struct FP32Vec16 : public Vec { float32x4x4_t reg; - explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + explicit FP32Vec16(float v) + : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} - explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + explicit FP32Vec16() + : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), + vmovq_n_f32(0.0)}) {} - explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + explicit FP32Vec16(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), + vld1q_f32(ptr + 12)}) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec8 &data) { - reg.val[0] = data.reg.val[0]; - reg.val[1] = data.reg.val[1]; - reg.val[2] = data.reg.val[0]; - reg.val[3] = data.reg.val[1]; + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {} - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(bfloat16x8x2_t v) : reg({ - vcvtq_low_f32_bf16(v.val[0]), - vcvtq_high_f32_bf16(v.val[0]), - vcvtq_low_f32_bf16(v.val[1]), - vcvtq_high_f32_bf16(v.val[1]) - }) {}; - #endif +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) + : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {}; +#endif - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; }; - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(const BF16Vec16 &v) : reg({ - vcvtq_low_f32_bf16(v.reg.val[0]), - vcvtq_high_f32_bf16(v.reg.val[0]), - vcvtq_low_f32_bf16(v.reg.val[1]), - vcvtq_high_f32_bf16(v.reg.val[1]) - }) {}; - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; - #endif - - explicit FP32Vec16(const FP16Vec16 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); - reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); - reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16& v) + : reg({vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1])}) {}; + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; +#endif + + explicit FP32Vec16(const FP16Vec16& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); }; - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vaddq_f32(reg.val[0], b.reg.val[0]), - vaddq_f32(reg.val[1], b.reg.val[1]), - vaddq_f32(reg.val[2], b.reg.val[2]), - vaddq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vmulq_f32(reg.val[0], b.reg.val[0]), - vmulq_f32(reg.val[1], b.reg.val[1]), - vmulq_f32(reg.val[2], b.reg.val[2]), - vmulq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vsubq_f32(reg.val[0], b.reg.val[0]), - vsubq_f32(reg.val[1], b.reg.val[1]), - vsubq_f32(reg.val[2], b.reg.val[2]), - vsubq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vdivq_f32(reg.val[0], b.reg.val[0]), - vdivq_f32(reg.val[1], b.reg.val[1]), - vdivq_f32(reg.val[2], b.reg.val[2]), - vdivq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3])})); }; float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; }; - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -479,7 +483,7 @@ struct FP32Vec16 : public Vec { return answer; }; - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); vst1q_f32(ptr + 8, reg.val[2]); @@ -487,43 +491,59 @@ struct FP32Vec16 : public Vec { }; }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; #ifdef ARM_BF16_SUPPORT -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; #endif -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<__fp16 *>(ptr) = v; +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast<__fp16*>(ptr) = v; } -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { - float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); - float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); - float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); - float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); - reg.val[0] = vcombine_f16(low_0, high_0); - reg.val[1] = vcombine_f16(low_1, high_1); + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); }; -inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { - float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); - float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); - reg = vcombine_f16(lower_half, upper_half); + reg = vcombine_f16(lower_half, upper_half); }; -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); @@ -531,8 +551,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { }; #ifdef ARM_BF16_SUPPORT -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); @@ -551,22 +570,22 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { #endif #ifdef ARM_BF16_SUPPORT -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) - }){}; +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) { + }; + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), + v.reg.val[3])}) {}; #endif -inline void prefetch(const void *addr) { - __builtin_prefetch(addr, 0, 1); -}; +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }; #ifdef ARM_BF16_SUPPORT template <> -inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v); }; #endif -}; \ No newline at end of file +}; // namespace vec_op \ No newline at end of file diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp index b50bdadc5713d..a8e1be37eb418 100644 --- a/csrc/cpu/cpu_types_vsx.hpp +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -9,38 +9,40 @@ namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." << std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec { __vector signed short reg; - explicit BF16Vec8(const void *ptr) - : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } }; struct BF16Vec16 : public Vec { @@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec { ss16x8x2_t reg; - explicit BF16Vec16(const void *ptr) { + explicit BF16Vec16(const void* ptr) { // Load 256 bits in two parts - reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); - reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); } - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { + void save(void* ptr) const { // Save 256 bits in two parts - vec_xst(reg.val[0], 0, (signed short *)ptr); - vec_xst(reg.val[1], 16, (signed short *)ptr); + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); } }; @@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec { constexpr static int VEC_ELEM_NUM = 32; ss16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {} + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {} + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct FP32Vec4 : public Vec { @@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vec_splats(0.0f)) {} - explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} explicit FP32Vec4(__vector float data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec { reg.val[1] = vec_splats(0.0f); } - explicit FP32Vec8(const float *ptr) { + explicit FP32Vec8(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); } explicit FP32Vec8(f32x4x2_t data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) { + explicit FP32Vec8(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; } - explicit FP32Vec8(const BF16Vec8 &v) { + explicit FP32Vec8(const BF16Vec8& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); reg.val[1] = (__vector float)vec_mergel(zero, v.reg); } @@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec { return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); } @@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec { reg.val[3] = vec_splats(0.0f); } - explicit FP32Vec16(const float *ptr) { + explicit FP32Vec16(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); reg.val[2] = vec_xl(32, ptr); @@ -284,78 +289,76 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec16 &data) { + explicit FP32Vec16(const FP32Vec16& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[2]; reg.val[3] = data.reg.val[3]; } - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; } - explicit FP32Vec16(const FP32Vec8 &data) { + explicit FP32Vec16(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[0]; reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_mul(reg.val[0], b.reg.val[0]), - vec_mul(reg.val[1], b.reg.val[1]), - vec_mul(reg.val[2], b.reg.val[2]), - vec_mul(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_add(reg.val[0], b.reg.val[0]), - vec_add(reg.val[1], b.reg.val[1]), - vec_add(reg.val[2], b.reg.val[2]), - vec_add(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_sub(reg.val[0], b.reg.val[0]), - vec_sub(reg.val[1], b.reg.val[1]), - vec_sub(reg.val[2], b.reg.val[2]), - vec_sub(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_div(reg.val[0], b.reg.val[0]), - vec_div(reg.val[1], b.reg.val[1]), - vec_div(reg.val[2], b.reg.val[2]), - vec_div(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); } float reduce_sum() const { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec { return result; } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[2], 32, ptr); @@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec { } }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } #ifndef __VEC_CLASS_FP_NAN -#define __VEC_CLASS_FP_NAN (1 << 6) + #define __VEC_CLASS_FP_NAN (1 << 6) #endif -const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 20, 21, 24, 25, 28, 29}; #ifndef _ARCH_PWR10 -const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; -const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; -const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; -const static __vector unsigned int one = { 1, 1, 1, 1 }; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; #endif -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { #ifdef _ARCH_PWR10 __vector signed short ret[2]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); reg = vec_perm(ret[0], ret[1], omask); #elif defined(_ARCH_PWR9) __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); @@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { __vector unsigned int rnd1 = vec_add(lsb1, bias); inp0 = vec_add(inp0, rnd0); inp1 = vec_add(inp1, rnd1); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp0 = vec_sr(inp0, sh16); @@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { #endif } -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { #ifdef _ARCH_PWR10 __vector signed short ret[4]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); - ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); - ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[3]); reg.val[0] = vec_perm(ret[0], ret[1], omask); reg.val[1] = vec_perm(ret[2], ret[3], omask); #elif defined(_ARCH_PWR9) @@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { inp1 = vec_add(inp1, rnd1); inp2 = vec_add(inp2, rnd2); inp3 = vec_add(inp3, rnd3); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); - __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); - __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = + vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = + vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp2 = vec_sel(inp2, nan, sel2); @@ -482,10 +514,10 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { #endif } -inline void prefetch(const void *addr) { +inline void prefetch(const void* addr) { __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4bb4eb0f491ac..a4ef2be2a58ca 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,39 +11,40 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - RECORD_FUNCTION(#NAME, c10::ArrayRef({})); -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) \ + RECORD_FUNCTION(#NAME, c10::ArrayRef({})); + #define CPU_KERNEL_GUARD_OUT(NAME) #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -55,12 +56,12 @@ struct FP16Vec8 : public Vec { __m128i reg; - explicit FP16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit FP16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct FP16Vec16 : public Vec { @@ -68,12 +69,12 @@ struct FP16Vec16 : public Vec { __m256i reg; - explicit FP16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit FP16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit FP16Vec16(const FP32Vec16 &); + explicit FP16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -87,12 +88,12 @@ struct BF16Vec8 : public Vec { __m128i reg; - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -100,12 +101,12 @@ struct BF16Vec16 : public Vec { __m256i reg; - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit BF16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -120,11 +121,11 @@ struct BF16Vec32 : public Vec { __m512i reg; - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(__m512i data) : reg(data) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg((__m512i)_mm512_inserti32x4( _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( (__m128i)vec8_data.reg), @@ -132,7 +133,7 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 2), (__m128i)vec8_data.reg, 3)) {} - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; } }; #else struct BF16Vec32 : public Vec { @@ -141,24 +142,24 @@ struct BF16Vec32 : public Vec { __m256i reg_low; __m256i reg_high; - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + explicit BF16Vec32(const void* ptr) + : reg_low(_mm256_loadu_si256((__m256i const*)ptr)), + reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {} - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} + explicit BF16Vec32(__m256i low, __m256i high) + : reg_low(low), reg_high(high) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + void save(void* ptr) const { + *reinterpret_cast<__m256i*>(ptr) = reg_low; + *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; } }; #endif @@ -176,11 +177,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) {} explicit FP32Vec4(__m128 data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -196,15 +197,15 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {} explicit FP32Vec8(__m256 data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {} - explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} + explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {} - explicit FP32Vec8(const BF16Vec8 &v) + explicit FP32Vec8(const BF16Vec8& v) : reg(_mm256_castsi256_ps( _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} @@ -212,7 +213,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -244,27 +246,27 @@ struct FP32Vec8 : public Vec { erf(ar.values[1]), erf(ar.values[0]))); } - FP32Vec8 operator*(const FP32Vec8 &b) const { + FP32Vec8 operator*(const FP32Vec8& b) const { return FP32Vec8(_mm256_mul_ps(reg, b.reg)); } - FP32Vec8 operator+(const FP32Vec8 &b) const { + FP32Vec8 operator+(const FP32Vec8& b) const { return FP32Vec8(_mm256_add_ps(reg, b.reg)); } - FP32Vec8 operator-(const FP32Vec8 &b) const { + FP32Vec8 operator-(const FP32Vec8& b) const { return FP32Vec8(_mm256_sub_ps(reg, b.reg)); } - FP32Vec8 operator/(const FP32Vec8 &b) const { + FP32Vec8 operator/(const FP32Vec8& b) const { return FP32Vec8(_mm256_div_ps(reg, b.reg)); } - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); } }; #ifdef __AVX512F__ -struct INT32Vec16: public Vec { +struct INT32Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m512i reg; @@ -272,12 +274,11 @@ struct INT32Vec16: public Vec { }; __m512i reg; - - explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} - void save(int32_t* ptr) const { - _mm512_storeu_epi32(ptr, reg); - } + explicit INT32Vec16(const void* data_ptr) + : reg(_mm512_loadu_epi32(data_ptr)) {} + + void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); } void save(int32_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -301,11 +302,11 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(__m512 data) : reg(data) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), @@ -313,36 +314,37 @@ struct FP32Vec16 : public Vec { (__m128i)data.reg, 2), (__m128i)data.reg, 3)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg((__m512)_mm512_inserti32x8( _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - explicit FP32Vec16(const BF16Vec16 &v) + explicit FP32Vec16(const BF16Vec16& v) : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const INT32Vec16 &v) - : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} + explicit FP32Vec16(const INT32Vec16& v) + : reg(_mm512_cvt_roundepi32_ps( + v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm512_mul_ps(reg, b.reg)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm512_add_ps(reg, b.reg)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } @@ -370,9 +372,7 @@ struct FP32Vec16 : public Vec { return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); } - FP32Vec16 abs() const { - return FP32Vec16(_mm512_abs_ps(reg)); - } + FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); } float reduce_sum() const { return _mm512_reduce_add_ps(reg); } @@ -380,14 +380,15 @@ struct FP32Vec16 : public Vec { float reduce_min() const { return _mm512_reduce_min_ps(reg); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); return _mm512_mask_reduce_add_ps(mask, reg); } - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -407,32 +408,30 @@ struct FP32Vec16 : public Vec { __m256 reg_low; __m256 reg_high; - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} + explicit FP32Vec16(float v) + : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {} - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} + explicit FP32Vec16() + : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} + explicit FP32Vec16(const float* ptr) + : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {} explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} + explicit FP32Vec16(const FP32Vec16& data) + : reg_low(data.reg_low), reg_high(data.reg_high) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)), reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg_low(data.reg), reg_high(data.reg) {} - explicit FP32Vec16(const FP16Vec16 &v) { + explicit FP32Vec16(const FP16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -440,9 +439,9 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_cvtph_ps(high); } - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -456,24 +455,24 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_castsi256_ps(v_high_shifted); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), _mm256_mul_ps(reg_high, b.reg_high)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), _mm256_add_ps(reg_high, b.reg_high)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), _mm256_sub_ps(reg_high, b.reg_high)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), _mm256_div_ps(reg_high, b.reg_high)); } @@ -484,7 +483,8 @@ struct FP32Vec16 : public Vec { return low.reduce_sum() + high.reduce_sum(); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { float sum = 0.0; static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); @@ -507,7 +507,7 @@ struct FP32Vec16 : public Vec { return sum; } - void save(float *ptr) const { + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg_low); _mm256_storeu_ps(ptr + 8, reg_high); } @@ -515,7 +515,7 @@ struct FP32Vec16 : public Vec { #endif #ifdef __AVX512F__ -struct INT8Vec16: public Vec { +struct INT8Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m128i reg; @@ -523,14 +523,12 @@ struct INT8Vec16: public Vec { }; __m128i reg; - - explicit INT8Vec16(const FP32Vec16& vec) : reg( - _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) - ) {} - void save(int8_t* ptr) const { - _mm_storeu_epi8(ptr, reg); - } + explicit INT8Vec16(const FP32Vec16& vec) + : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32( + vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {} + + void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); } void save(int8_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -540,71 +538,92 @@ struct INT8Vec16: public Vec { }; #endif -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast(ptr) = +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast(ptr) = _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) : reg(_mm256_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #ifdef __AVX512F__ -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) : reg(_mm512_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #else -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) - : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm256_insertf128_si256( + _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), + FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} #endif #ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v); } -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); } #else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + #ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(_mm256_cvtepi32_epi16( _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg(_mm512_cvtepi32_epi16( _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ + #else +namespace { __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { __m256i ai = _mm256_castps_si256(a); ai = _mm256_srli_epi32(ai, 16); @@ -612,21 +631,21 @@ __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { ai = _mm256_permute4x64_epi64(ai, 0b00111001); return _mm256_extracti128_si256(ai, 0); } -} +} // namespace -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); } -#endif // __AVX512F__ -#endif // __AVX512BF16__ + #endif // __AVX512F__ +#endif // __AVX512BF16__ -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } +inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 85e359aa57113..07c9e46c27b06 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -27,8 +27,7 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { int max_shared_mem_per_block_opt_in = 0; cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, - cudaDevAttrMaxSharedMemoryPerBlockOptin, - device); + cudaDevAttrMaxSharedMemoryPerBlockOptin, device); return max_shared_mem_per_block_opt_in; } diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index e92104399342d..36cf8e7440eca 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -25,10 +25,12 @@ Check out the [building from source](#build-from-source) documentation for detai ```bash pip install -r requirements-dev.txt -# linting and formatting -bash format.sh -# Static type checking -mypy +# Linting, formatting and static type checking +pre-commit install + +# You can manually run pre-commit with +pre-commit run --all-files + # Unit tests pytest tests/ ``` @@ -88,7 +90,8 @@ If the PR spans more than one category, please include all relevant prefixes. The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use to format your code. +- Pass all linter checks. Please use `pre-commit` to format your code. See + if `pre-commit` is new to you. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 2606e2765c1ae..438be47316f3b 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -42,6 +42,9 @@ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` for vLLM to find the current GPU type and build for that. + +If you are using Podman instead of Docker, you might need to disable SELinux labeling by +adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). ``` ## Building for Arm64/aarch64 diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index f6f9d3c303f89..4256027e6c40e 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -70,7 +70,7 @@ Currently, there are no pre-built ROCm wheels. # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2 # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 85d844f3d3f55..3da5aaf713c1f 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -470,6 +470,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding - `Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +* - `Qwen2ForProcessRewardModel` + - Qwen2-based + - `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using @@ -618,7 +623,7 @@ See [this page](#generative-models) for more information on how to use generativ * - `DeepseekVLV2ForCausalLM` - DeepSeek-VL2 - T + I+ - - `deepseek-ai/deepseek-vl2-tiny`(WIP), `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) + - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) - - ✅︎ - ✅︎ @@ -754,7 +759,7 @@ See [this page](#generative-models) for more information on how to use generativ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ - - + - ✅︎ * - `UltravoxModel` - Ultravox - T + AE+ @@ -767,17 +772,10 @@ See [this page](#generative-models) for more information on how to use generativ E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -````{note} -The `deepseek-ai/deepseek-vl2-tiny` is not supported yet. - -To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package: -```shell -pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git +```{note} +To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. ``` -Besides, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -```` - ```{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py new file mode 100644 index 0000000000000..3bc303dad277f --- /dev/null +++ b/examples/offline_inference/rlhf.py @@ -0,0 +1,191 @@ +""" +a simple demonstration of RLHF with vLLM, inspired by +the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF . +It follows the design that, training processes and inference processes +are different, and they live on different GPUs. +Training processes send prompts to inference processes to generate data, +and also synchronize the weights of the model by broadcasting the weights +from the training process to the inference process. +Note that this is a simple demonstration of one training instance and one +inference instance. In practice, there could be multiple training instances +and multiple inference instances. For the full implementation, please refer +to the OpenRLHF framework. +""" +import os + +import ray +import torch +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from transformers import AutoModelForCausalLM + +from vllm import LLM, SamplingParams, configure_as_vllm_process +from vllm.utils import get_ip, get_open_port +from vllm.worker.worker import Worker + + +def stateless_init_process_group(master_address, master_port, rank, world_size, + device): + """ + vLLM provides `StatelessProcessGroup` to create a process group + without considering the global process group in torch.distributed. + It is recommended to create `StatelessProcessGroup`, and then initialize + the data-plane communication (NCCL) between external (train processes) + and vLLM workers. + """ + from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + from vllm.distributed.utils import StatelessProcessGroup + pg = StatelessProcessGroup.create(host=master_address, + port=master_port, + rank=rank, + world_size=world_size) + pynccl = PyNcclCommunicator(pg, device=device) + return pynccl + + +class MyWorker(Worker): + """ + The `MyWorker` class inherits from `Worker` to provide custom functions. + For simplicity, we define the `MyWorker` class in this self-contained + script. Normally, we should define the `MyWorker` class in a separate + file and pass the qualified name of the class to the `worker_cls` + parameter. + """ + + def init_weight_update_group(self, master_address, master_port, + rank_offset, world_size): + from vllm.distributed.parallel_state import get_world_group + rank = get_world_group().rank + rank_offset + self.model_update_group = stateless_init_process_group( + master_address, + master_port, + rank, + world_size, + self.device, + ) + + def update_weight(self, name, dtype, shape): + weight = torch.empty(shape, dtype=dtype, device="cuda") + self.model_update_group.broadcast(weight, + src=0, + stream=torch.cuda.current_stream()) + + self.model_runner.model.load_weights(weights=[(name, weight)]) + + del weight + + def check_weights_changed(self): + """ + Check if the weights are updated to 0. + """ + weights_updated = True + for name, p in self.model_runner.model.named_parameters(): + weights_updated = weights_updated and torch.allclose( + p, torch.zeros_like(p)) + return weights_updated + + +class MyLLM(LLM): + + def __init__(self, *args, **kwargs): + # a hack to make the script work. + # stop ray from manipulating CUDA_VISIBLE_DEVICES + # at the top-level + del os.environ["CUDA_VISIBLE_DEVICES"] + super().__init__(*args, **kwargs) + + +""" +Start the training process, here we use huggingface transformers +as an example to hold a model on GPU 0. + +It is important for all the processes outside of vLLM to call +`configure_as_vllm_process` to set some common environment variables +the same as vLLM workers. +""" +configure_as_vllm_process() + +train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") +train_model.to("cuda:0") +""" +Start the inference process, here we use vLLM to hold a model on GPU 1 and +GPU 2. For the details on how to use ray, please refer to the ray +documentation https://docs.ray.io/en/latest/ . +""" +os.environ["CUDA_VISIBLE_DEVICES"] = "1,2" +ray.init() + +pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2) +ray.get(pg_inference.ready()) +scheduling_inference = PlacementGroupSchedulingStrategy( + placement_group=pg_inference, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=0, +) +""" +launch the vLLM inference engine. +here we use `enforce_eager` to reduce the start time. +""" +llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=scheduling_inference, +)(MyLLM).remote( + model="facebook/opt-125m", + enforce_eager=True, + worker_cls=MyWorker, + tensor_parallel_size=2, + distributed_executor_backend="ray", +) + +# Generate texts from the prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0) + +outputs = ray.get(llm.generate.remote(prompts, sampling_params)) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + +# set up the communication between the training process +# and the inference engine. +master_address = get_ip() +master_port = get_open_port() + +handle = llm.collective_rpc.remote("init_weight_update_group", + args=(master_address, master_port, 1, 3)) +model_update_group = stateless_init_process_group(master_address, master_port, + 0, 3, torch.device("cuda:0")) +ray.get(handle) + +# simulate training, modify the weights of the model. +for name, p in train_model.named_parameters(): + p.data.zero_() + +# sync weight from the training process to the inference engine. +for name, p in train_model.named_parameters(): + handle = llm.collective_rpc.remote("update_weight", + args=(name, p.dtype, p.shape)) + model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream()) + ray.get(handle) + +# check if the weights are updated. +assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) + +# use the updated model to generate texts, they will be nonsense +# because the weights are all zeros. +outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) +for output in outputs_updated: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py new file mode 100644 index 0000000000000..b6de73eb7266e --- /dev/null +++ b/examples/offline_inference/torchrun_example.py @@ -0,0 +1,64 @@ +""" +experimental support for tensor-parallel inference with torchrun, +see https://github.com/vllm-project/vllm/issues/11400 for +the motivation and use case for this example. +run the script with `torchrun --nproc-per-node=2 torchrun_example.py`, +the argument 2 should match the `tensor_parallel_size` below. +see `tests/distributed/test_torchrun_example.py` for the unit test. +""" + +from vllm import LLM, SamplingParams + +# Create prompts, the same across all ranks +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create sampling parameters, the same across all ranks +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Use `distributed_executor_backend="external_launcher"` so that +# this llm engine/instance only creates one worker. +llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="external_launcher", +) + +outputs = llm.generate(prompts, sampling_params) + +# all ranks will have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") +""" +Further tips: + +1. to communicate control messages across all ranks, use the cpu group, +a PyTorch ProcessGroup with GLOO backend. + +```python +from vllm.distributed.parallel_state import get_world_group +cpu_group = get_world_group().cpu_group +torch_rank = dist.get_rank(group=cpu_group) +if torch_rank == 0: + # do something for rank 0, e.g. saving the results to disk. +``` + +2. to communicate data across all ranks, use the model's device group, +a PyTorch ProcessGroup with NCCL backend. +```python +from vllm.distributed.parallel_state import get_world_group +device_group = get_world_group().device_group +``` + +3. to access the model directly in every rank, use the following code: +```python +llm.llm_engine.model_executor.driver_worker.worker.model_runner.model +``` +""" diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index ad32b9fe242e9..69228bbf22949 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -70,7 +70,7 @@ def run_chameleon(question: str, modality: str): def run_deepseek_vl2(question: str, modality: str): assert modality == "image" - model_name = "deepseek-ai/deepseek-vl2-small" + model_name = "deepseek-ai/deepseek-vl2-tiny" llm = LLM(model=model_name, max_model_len=4096, @@ -325,7 +325,6 @@ def run_mllama(question: str, modality: str): model=model_name, max_model_len=4096, max_num_seqs=16, - enforce_eager=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index c6cf3f30c31cb..43c44fa867e0a 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -55,7 +55,7 @@ def load_aria(question, image_urls: List[str]) -> ModelRequestData: def load_deepseek_vl2(question: str, image_urls: List[str]): - model_name = "deepseek-ai/deepseek-vl2-small" + model_name = "deepseek-ai/deepseek-vl2-tiny" llm = LLM(model=model_name, max_model_len=4096, @@ -186,7 +186,6 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: model=model_name, max_model_len=4096, max_num_seqs=16, - enforce_eager=True, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -394,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, - "deepseek_vl2": load_deepseek_vl2, + "deepseek_vl_v2": load_deepseek_vl2, "h2ovl_chat": load_h2onvl, "idefics3": load_idefics3, "internvl_chat": load_internvl, diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index 87155273a81d1..2bb2824c6c86f 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -3,6 +3,8 @@ # We will launch 2 vllm instances (1 for prefill and 1 for decode), # and then transfer the KV cache between them. +set -xe + echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" sleep 1 @@ -69,7 +71,7 @@ wait_for_server 8200 # instance # NOTE: the usage of this API is subject to change --- in the future we will # introduce "vllm connect" to connect between prefill and decode instances -python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & sleep 1 # serve two example requests diff --git a/examples/template_deepseek_vl2.jinja b/examples/template_deepseek_vl2.jinja new file mode 100644 index 0000000000000..fbf3d320094d5 --- /dev/null +++ b/examples/template_deepseek_vl2.jinja @@ -0,0 +1,23 @@ +{%- if messages[0]['role'] == 'system' -%} + {%- set system_message = messages[0]['content'] -%} + {%- set messages = messages[1:] -%} +{%- else -%} + {% set system_message = '' -%} +{%- endif -%} + +{{ bos_token + system_message }} +{%- for message in messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} + {%- endif -%} + + {%- if message['role'] == 'user' -%} + {{ '<|User|>: ' + message['content'] + '\n' }} + {%- elif message['role'] == 'assistant' -%} + {{ '<|Assistant|>: ' + message['content'] + eos_token + '\n' }} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{ '<|Assistant|>: ' }} +{% endif %} diff --git a/format.sh b/format.sh index 2277eef93c745..4bcd0be0c96e5 100755 --- a/format.sh +++ b/format.sh @@ -1,321 +1,5 @@ -#!/usr/bin/env bash -# YAPF formatter, adapted from ray and skypilot. -# -# Usage: -# # Do work and commit your work. +#!/bin/bash -# # Format files that differ from origin/main. -# bash format.sh - -# # Commit changed files with message 'Run yapf and ruff' -# -# -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. -# You are encouraged to run this locally before pushing changes for review. - -# Cause the script to exit if a single command fails -set -eo pipefail - -# this stops git rev-parse from failing if we run this from the .git directory -builtin cd "$(dirname "${BASH_SOURCE:-$0}")" -ROOT="$(git rev-parse --show-toplevel)" -builtin cd "$ROOT" || exit 1 - -check_command() { - if ! command -v "$1" &> /dev/null; then - echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" - exit 1 - fi -} - -check_command yapf -check_command ruff -check_command mypy -check_command codespell -check_command isort -check_command clang-format - -YAPF_VERSION=$(yapf --version | awk '{print $2}') -RUFF_VERSION=$(ruff --version | awk '{print $2}') -MYPY_VERSION=$(mypy --version | awk '{print $2}') -CODESPELL_VERSION=$(codespell --version) -ISORT_VERSION=$(isort --vn) -CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}') - -# # params: tool name, tool version, required version -tool_version_check() { - expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) - if [[ "$2" != "$expected" ]]; then - echo "❓❓Wrong $1 version installed: $expected is required, not $2." - exit 1 - fi -} - -tool_version_check "yapf" "$YAPF_VERSION" -tool_version_check "ruff" "$RUFF_VERSION" -tool_version_check "mypy" "$MYPY_VERSION" -tool_version_check "isort" "$ISORT_VERSION" -tool_version_check "codespell" "$CODESPELL_VERSION" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION" - -YAPF_FLAGS=( - '--recursive' - '--parallel' -) - -YAPF_EXCLUDES=( - '--exclude' 'build/**' -) - -# Format specified files -format() { - yapf --in-place "${YAPF_FLAGS[@]}" "$@" -} - -# Format files that differ from main branch. Ignores dirs that are not slated -# for autoformat yet. -format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause yapf to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" - fi - -} - -# Format all files -format_all() { - yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . -} - -## This flag formats individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - format "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is formatted. -elif [[ "$1" == '--all' ]]; then - format_all -else - # Format only the files that changed in last commit. - format_changed -fi -echo 'vLLM yapf: Done' - -# Run mypy -echo 'vLLM mypy:' -tools/mypy.sh -echo 'vLLM mypy: Done' - - -# If git diff returns a file that is in the skip list, the file may be checked anyway: -# https://github.com/codespell-project/codespell/issues/1915 -# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem -CODESPELL_EXCLUDES=( - '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' -) - -# check spelling of specified files -spell_check() { - codespell "$@" -} - -spell_check_all(){ - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" -} - -# Spelling check of files that differ from main branch. -spell_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell "${CODESPELL_EXCLUDES[@]}" - fi -} - -# Run Codespell -## This flag runs spell check of individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - spell_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - spell_check_all -else - # Check spelling only of the files that changed in last commit. - spell_check_changed -fi -echo 'vLLM codespell: Done' - - -# Lint specified files -lint() { - ruff check "$@" -} - -# Lint files that differ from main branch. Ignores dirs that are not slated -# for autolint yet. -lint_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff check - fi - -} - -# Run Ruff -### This flag lints individual files. --files *must* be the first command line -### arg to use this option. -if [[ "$1" == '--files' ]]; then - lint "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - lint vllm tests -else - # Format only the files that changed in last commit. - lint_changed -fi -echo 'vLLM ruff: Done' - -# check spelling of specified files -isort_check() { - isort "$@" -} - -isort_check_all(){ - isort . -} - -# Spelling check of files that differ from main branch. -isort_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - isort - fi -} - -# Run Isort -# This flag runs spell check of individual files. --files *must* be the first command line -# arg to use this option. -if [[ "$1" == '--files' ]]; then - isort_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - isort_check_all -else - # Check spelling only of the files that changed in last commit. - isort_check_changed -fi -echo 'vLLM isort: Done' - -# Clang-format section -# Exclude some files for formatting because they are vendored -# NOTE: Keep up to date with .github/workflows/clang-format.yml -CLANG_FORMAT_EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' -) - -# Format specified files with clang-format -clang_format() { - clang-format -i "$@" -} - -# Format files that differ from main branch with clang-format. -clang_format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause clang-format to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - # Get the list of changed files, excluding the specified ones - changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) - if [ -n "$changed_files" ]; then - echo "$changed_files" | xargs -P 5 clang-format -i - fi -} - -# Format all files with clang-format -clang_format_all() { - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ - | xargs clang-format -i -} - -# Run clang-format -if [[ "$1" == '--files' ]]; then - clang_format "${@:2}" -elif [[ "$1" == '--all' ]]; then - clang_format_all -else - clang_format_changed -fi -echo 'vLLM clang-format: Done' - -echo 'vLLM actionlint:' -tools/actionlint.sh -color -echo 'vLLM actionlint: Done' - -echo 'vLLM shellcheck:' -tools/shellcheck.sh -echo 'vLLM shellcheck: Done' - -echo 'excalidraw png check:' -tools/png-lint.sh -echo 'excalidraw png check: Done' - -if ! git diff --quiet &>/dev/null; then - echo - echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" - git --no-pager diff --name-only - echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." - - exit 1 -else - echo "✨🎉 Format check passed! Congratulations! 🎉✨" -fi - -echo 'vLLM doc-lint:' -tools/doc-lint.sh -echo 'vLLM doc-lint: Done' +echo "vLLM linting system has been moved from format.sh to pre-commit hook." +echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook." +echo "Then linters will run automatically before each commit." diff --git a/pyproject.toml b/pyproject.toml index 82275ccafb572..8f2e20d0f5800 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,11 @@ build-backend = "setuptools.build_meta" [tool.setuptools_scm] # version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` +[tool.yapfignore] +ignore_patterns = [ + "build/**", +] + [tool.ruff] # Allow lines to be as long as 80. line-length = 80 @@ -52,6 +57,9 @@ ignore = [ "B007", # f-string format "UP032", + # Python 3.8 typing + "UP006", "UP035", + ] [tool.mypy] diff --git a/requirements-lint.txt b/requirements-lint.txt index ffc73f90a0d48..62446f94048df 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,15 +1,2 @@ # formatting -yapf==0.32.0 -toml==0.10.2 -tomli==2.0.2 -ruff==0.6.5 -codespell==2.3.0 -isort==5.13.2 -clang-format==18.1.5 -pymarkdownlnt==0.9.26 - -# type checking -mypy==1.11.1 -types-PyYAML -types-requests -types-setuptools +pre-commit==4.0.1 diff --git a/requirements-test.in b/requirements-test.in index 4b4dc376d1fa5..bc76a91ad5356 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -29,4 +29,7 @@ lm-eval[api]==0.4.4 # required for model evaluation test bitsandbytes>=0.45.0 buildkite-test-collector==0.1.9 +genai_perf==0.0.8 +tritonclient==2.51.0 + numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index f576e42afcbbf..09e009c2e21f4 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -37,7 +37,7 @@ audioread==3.0.1 # via librosa awscli==1.35.23 # via -r requirements-test.in -bitsandbytes>=0.45.0 +bitsandbytes==0.45.0 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator @@ -75,6 +75,8 @@ colorama==0.4.6 # tqdm-multiprocess contourpy==1.3.0 # via matplotlib +cramjam==2.9.0 + # via fastparquet cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 @@ -109,6 +111,8 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval +fastparquet==2024.11.0 + # via genai-perf fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -130,8 +134,11 @@ fsspec[http]==2024.9.0 # via # datasets # evaluate + # fastparquet # huggingface-hub # torch +genai-perf==0.0.8 + # via -r requirements-test.in genson==1.3.0 # via datamodel-code-generator h11==0.14.0 @@ -186,6 +193,8 @@ jsonschema==4.23.0 # ray jsonschema-specifications==2024.10.1 # via jsonschema +kaleido==0.2.1 + # via genai-perf kiwisolver==1.4.7 # via matplotlib lazy-loader==0.4 @@ -200,6 +209,8 @@ lm-eval[api]==0.4.4 # via -r requirements-test.in lxml==5.3.0 # via sacrebleu +markdown-it-py==3.0.0 + # via rich markupsafe==3.0.2 # via jinja2 matplotlib==3.9.2 @@ -209,6 +220,8 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy +mdurl==0.1.2 + # via markdown-it-py mistral-common[opencv]==1.5.1 # via # -r requirements-test.in @@ -249,6 +262,8 @@ numpy==1.26.4 # datasets # decord # evaluate + # fastparquet + # genai-perf # librosa # matplotlib # mistral-common @@ -256,15 +271,18 @@ numpy==1.26.4 # numexpr # opencv-python-headless # pandas + # patsy # peft # rouge-score # sacrebleu # scikit-learn # scipy # soxr + # statsmodels # tensorizer # torchvision # transformers + # tritonclient nvidia-cublas-cu12==12.4.5.8 # via # nvidia-cudnn-cu12 @@ -306,30 +324,39 @@ packaging==24.1 # datamodel-code-generator # datasets # evaluate + # fastparquet # huggingface-hub # lazy-loader # matplotlib # peft + # plotly # pooch # pytest # pytest-rerunfailures # ray + # statsmodels # transformers # typepy pandas==2.2.3 # via # datasets # evaluate + # fastparquet + # genai-perf + # statsmodels pathspec==0.12.1 # via black pathvalidate==3.2.1 # via pytablewriter +patsy==1.0.1 + # via statsmodels peft==0.13.2 # via # -r requirements-test.in # lm-eval pillow==10.4.0 # via + # genai-perf # matplotlib # mistral-common # sentence-transformers @@ -338,6 +365,8 @@ platformdirs==4.3.6 # via # black # pooch +plotly==5.24.1 + # via genai-perf pluggy==1.5.0 # via pytest pooch==1.8.2 @@ -360,7 +389,9 @@ psutil==6.1.0 py==1.11.0 # via pytest-forked pyarrow==18.0.0 - # via datasets + # via + # datasets + # genai-perf pyasn1==0.6.1 # via rsa pybind11==2.13.6 @@ -373,6 +404,8 @@ pydantic[email]==2.9.2 # mistral-common pydantic-core==2.23.4 # via pydantic +pygments==2.18.0 + # via rich pyparsing==3.2.0 # via matplotlib pytablewriter==1.2.0 @@ -381,14 +414,18 @@ pytest==8.3.3 # via # -r requirements-test.in # buildkite-test-collector + # genai-perf # pytest-asyncio # pytest-forked + # pytest-mock # pytest-rerunfailures # pytest-shard pytest-asyncio==0.24.0 # via -r requirements-test.in pytest-forked==1.6.0 # via -r requirements-test.in +pytest-mock==3.14.0 + # via genai-perf pytest-rerunfailures==14.0 # via -r requirements-test.in pytest-shard==0.1.2 @@ -399,6 +436,8 @@ python-dateutil==2.9.0.post0 # matplotlib # pandas # typepy +python-rapidjson==1.20 + # via tritonclient pytz==2024.2 # via # pandas @@ -409,9 +448,11 @@ pyyaml==6.0.2 # awscli # datamodel-code-generator # datasets + # genai-perf # huggingface-hub # peft # ray + # responses # timm # transformers ray[adag]==2.40.0 @@ -438,8 +479,13 @@ requests==2.32.3 # mistral-common # pooch # ray + # responses # tiktoken # transformers +responses==0.25.3 + # via genai-perf +rich==13.9.4 + # via genai-perf rouge-score==0.1.2 # via lm-eval rpds-py==0.20.1 @@ -470,6 +516,7 @@ scipy==1.13.1 # librosa # scikit-learn # sentence-transformers + # statsmodels sentence-transformers==3.2.1 # via -r requirements-test.in sentencepiece==0.2.0 @@ -490,6 +537,8 @@ soxr==0.5.0.post1 # via librosa sqlitedict==2.1.0 # via lm-eval +statsmodels==0.14.4 + # via genai-perf sympy==1.13.1 # via torch tabledata==1.3.3 @@ -499,7 +548,9 @@ tabulate==0.9.0 tcolorpy==0.1.6 # via pytablewriter tenacity==9.0.0 - # via lm-eval + # via + # lm-eval + # plotly tensorizer==2.9.0 # via -r requirements-test.in threadpoolctl==3.5.0 @@ -540,6 +591,7 @@ tqdm-multiprocess==0.0.11 # via lm-eval transformers==4.47.0 # via + # genai-perf # lm-eval # peft # sentence-transformers @@ -548,6 +600,10 @@ transformers-stream-generator==0.0.5 # via -r requirements-test.in triton==3.1.0 # via torch +tritonclient==2.51.0 + # via + # -r requirements-test.in + # genai-perf typepy[datetime]==1.3.2 # via # dataproperty @@ -555,6 +611,7 @@ typepy[datetime]==1.3.2 # tabledata typing-extensions==4.12.2 # via + # bitsandbytes # huggingface-hub # librosa # mistral-common @@ -563,10 +620,12 @@ typing-extensions==4.12.2 # torch tzdata==2024.2 # via pandas -urllib3==1.26.20 +urllib3==2.2.3 # via # botocore # requests + # responses + # tritonclient word2number==1.1 # via lm-eval xxhash==3.5.0 diff --git a/setup.py b/setup.py index 7dfcec7f9f0c5..978625a069778 100644 --- a/setup.py +++ b/setup.py @@ -472,13 +472,9 @@ def get_gaudi_sw_version(): def get_vllm_version() -> str: - # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236 - try: - version = get_version( - write_to="vllm/_version.py", # TODO: move this to pyproject.toml - ) - except LookupError: - version = "0.0.0" + version = get_version( + write_to="vllm/_version.py", # TODO: move this to pyproject.toml + ) sep = "+" if "+" not in version else "." # dev versions might contain + diff --git a/tests/conftest.py b/tests/conftest.py index 95af4ac1eb17b..279c1bf9a3776 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -244,6 +244,7 @@ def video_assets() -> _VideoAssets: _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") class HfRunner: @@ -930,6 +931,10 @@ def score( req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + executor = self.model.llm_engine.model_executor + return executor.apply_model(func) + def __enter__(self): return self diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py new file mode 100644 index 0000000000000..7aa03d7f0402a --- /dev/null +++ b/tests/distributed/test_torchrun_example.py @@ -0,0 +1,56 @@ +# unit test for `examples/offline_inference/torchrun_example.py` + +import random + +import torch.distributed as dist + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import get_world_group + +# Create prompts +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# set different `gpu_memory_utilization` and `swap_space` for different ranks, +# to test if all ranks agree on the same kv cache configuration. +llm = LLM(model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="external_launcher", + gpu_memory_utilization=random.uniform(0.7, 0.9), + swap_space=random.randint(1, 4)) + +outputs = llm.generate(prompts, sampling_params) + +cpu_group = get_world_group().cpu_group + +torch_rank = dist.get_rank(group=cpu_group) + + +def test_consistent_across_ranks(obj): + if torch_rank == 0: + dist.broadcast_object_list([obj], src=0, group=cpu_group) + else: + container = [None] + dist.broadcast_object_list(container, src=0, group=cpu_group) + assert container[0] == obj + + +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_cpu_blocks) +test_consistent_across_ranks( + llm.llm_engine.vllm_config.cache_config.num_gpu_blocks) + +# all ranks should have the same outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + test_consistent_across_ranks(prompt) + test_consistent_across_ranks(generated_text) + print(f"Rank {torch_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index 2a057ca488a50..0e33f3662da82 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pytest @@ -18,7 +18,7 @@ class Mock: class CustomUniExecutor(UniProcExecutor): def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: @@ -51,7 +51,9 @@ def test_custom_executor(model, tmp_path): assert not os.path.exists(".marker") engine_args = EngineArgs( - model=model, distributed_executor_backend=CustomUniExecutor) + model=model, + distributed_executor_backend=CustomUniExecutor, + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index db70a808c008b..04505fcaae24b 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -22,7 +22,7 @@ def worker_method(self, worker_input: Any) -> Tuple[int, Any]: # simulate error case raise worker_input - return self.rank, input + return self.rpc_rank, input def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py new file mode 100644 index 0000000000000..22473ce275295 --- /dev/null +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -0,0 +1,36 @@ +import pytest + +from vllm import LLM + +from ...utils import fork_new_process_for_each_test + + +@pytest.mark.parametrize("tp_size", [1, 2]) +@pytest.mark.parametrize("backend", ["mp", "ray"]) +@fork_new_process_for_each_test +def test_collective_rpc(tp_size, backend): + if tp_size == 1 and backend == "ray": + pytest.skip("Skip duplicate test case") + if tp_size == 1: + backend = None + + # intentionally define the method and class in the test function, + # to test if they can be serialized and sent to the workers + def echo_rank(self): + return self.rank + + from vllm.worker.worker import Worker + + class MyWorker(Worker): + + def echo_rank(self): + return self.rank + + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", + enforce_eager=True, + load_format="dummy", + tensor_parallel_size=tp_size, + distributed_executor_backend=backend, + worker_cls=MyWorker) + for method in ["echo_rank", echo_rank]: + assert llm.collective_rpc(method) == list(range(tp_size)) diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 46a064f6d9e68..6ff99f6faa143 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -17,6 +17,33 @@ # generation quality here LORA_NAME = "typeof/zephyr-7b-beta-lora" +BADREQUEST_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + "bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + @pytest.fixture(scope="module") def zephyr_lora_files(): @@ -138,32 +165,36 @@ async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI, @pytest.mark.asyncio -async def test_dynamic_lora_invalid_lora_rank(client: openai.AsyncOpenAI, - tmp_path, zephyr_lora_files): - invalid_rank = tmp_path / "invalid_rank" - - # Copy adapter from zephyr_lora_files to invalid_rank - shutil.copytree(zephyr_lora_files, invalid_rank) - - with open(invalid_rank / "adapter_config.json") as f: +@pytest.mark.parametrize("test_name,config_change,expected_error", + BADREQUEST_CASES) +async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path, + zephyr_lora_files, test_name: str, + config_change: dict, + expected_error: str): + # Create test directory + test_dir = tmp_path / test_name + + # Copy adapter files + shutil.copytree(zephyr_lora_files, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(config_change) - print(adapter_config) - - # assert False - - # Change rank to invalid value - adapter_config["r"] = 1024 - with open(invalid_rank / "adapter_config.json", "w") as f: + # Save modified configuration + with open(config_path, "w") as f: json.dump(adapter_config, f) - with pytest.raises(openai.BadRequestError, - match="is greater than max_lora_rank"): + # Test loading the adapter + with pytest.raises(openai.BadRequestError, match=expected_error): await client.post("load_lora_adapter", cast_to=str, body={ - "lora_name": "invalid-json", - "lora_path": str(invalid_rank) + "lora_name": test_name, + "lora_path": str(test_dir) }) diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index a803ea4a8d6ad..06e0f93dbe269 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -12,6 +12,9 @@ def server(): args = [ "--enforce-eager", + # Will be used on tests to compare prompt input length + "--max-model-len", + "100" ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -20,8 +23,7 @@ def server(): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." @@ -45,8 +47,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = [ "What is the capital of the United States?", "What is the capital of France?" @@ -73,8 +74,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." @@ -91,3 +91,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, assert score.data is not None assert len(score.data) == 1 assert score.data[0].score >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str): + + text_1 = "What is the capital of France?" * 20 + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + assert score_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input." in \ + score_response.text + + # Test truncation + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + "truncate_prompt_tokens": 101 + }) + assert score_response.status_code == 400 + assert "Please, select a smaller truncation size." in \ + score_response.text diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 8f242df4a60e3..513b466c10d60 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -754,6 +754,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_chatglm.jinja", "string"), ("template_chatglm2.jinja", "string"), ("template_chatml.jinja", "string"), + ("template_deepseek_vl2.jinja", "string"), ("template_falcon_180b.jinja", "string"), ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 537d95b025a9d..b907af47d08d7 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -3,6 +3,7 @@ import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.utils import WeightsMapper @@ -30,11 +31,14 @@ def test_load_checkpoints( else: expected_lora_modules.append(module) if lora_name == "baichuan7B": + peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) # For the baichuan7B model, load it's LoRA, # and the test should pass. LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -43,9 +47,12 @@ def test_load_checkpoints( # Test that the target_modules contain prefix # such as "model.layers.0.self_atten.W_pack", and # the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_zero_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -53,9 +60,12 @@ def test_load_checkpoints( elif lora_name == "baichuan7B-zero-regex": # Test that the `target_modules` in the form of regular expressions, # such as `model\\..*(W_pack|o_proj)`, and the test should pass. + peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files, + max_position_embeddings=4096) LoRAModel.from_local_checkpoint( baichuan_regex_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -64,10 +74,13 @@ def test_load_checkpoints( # For the baichuan7B model, load chatglm3-6b's LoRA, # and the test should raise the following error. expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501 + peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files, + max_position_embeddings=4096) with pytest.raises(ValueError, match=expected_error): LoRAModel.from_local_checkpoint( chatglm3_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, @@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files): ".layers.": ".baichuan_layers.", }, ) + peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index e2daf9d135113..1c0ee01c038d0 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -3,6 +3,7 @@ import pytest from vllm.lora.models import LoRAModel +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.utils import get_adapter_absolute_path from vllm.model_executor.models.llama import LlamaForCausalLM @@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_path = get_adapter_absolute_path(lora_name) # lora loading should work for either absolute path and hugggingface id. + peft_helper = PEFTHelper.from_local_dir(lora_path, 4096) lora_model = LoRAModel.from_local_checkpoint( lora_path, expected_lora_modules, + peft_helper=peft_helper, lora_model_id=1, device="cpu", embedding_modules=embedding_modules, diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index ca523c66abe42..9a5b9aabf5078 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,5 +1,3 @@ -import json -import math import os from typing import Dict, List @@ -34,56 +32,6 @@ ] if current_platform.is_cuda_alike() else ["cpu"]) -def test_peft_helper(sql_lora_files): - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - peft_helper = PEFTHelper.from_dict(config) - assert peft_helper.r == 8 - assert peft_helper.lora_alpha == 16 - assert peft_helper.target_modules == [ - "q_proj", - "v_proj", - "k_proj", - "o_proj", - "gate_proj", - "up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - scaling = peft_helper.lora_alpha / peft_helper.r - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - # test RSLoRA - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_rslora=True) - peft_helper = PEFTHelper.from_dict(config) - - scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) - assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 - - expected_error = "vLLM only supports modules_to_save being None." - with pytest.raises(ValueError, match=expected_error): - config = dict( - r=8, - lora_alpha=16, - target_modules=["gate_proj"], - modules_to_save=["lm_head"], - ) - PEFTHelper.from_dict(config) - - expected_error = "vLLM does not yet support DoRA." - with pytest.raises(ValueError, match=expected_error): - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_dora=True) - PEFTHelper.from_dict(config) - - @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( @@ -91,11 +39,8 @@ def test_from_lora_tensors(sql_lora_files, device): new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) - lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") - with open(lora_config_path) as f: - config = json.load(f) - - peft_helper = PEFTHelper.from_dict(config) + peft_helper = PEFTHelper.from_local_dir(sql_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_lora_tensors( 1, tensors, diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py new file mode 100644 index 0000000000000..a524d5ce5f34a --- /dev/null +++ b/tests/lora/test_peft_helper.py @@ -0,0 +1,109 @@ +import json +import math +import shutil + +import pytest + +from vllm.config import LoRAConfig +from vllm.lora.peft_helper import PEFTHelper + +ERROR_CASES = [ + ( + "test_rank", + { + "r": 1024 + }, + "is greater than max_lora_rank", + ), + ( + "test_bias", + { + "bias": "all" + }, + "Adapter bias cannot be used without bias_enabled", + ), + ("test_dora", { + "use_dora": True + }, "does not yet support DoRA"), + ( + "test_modules_to_save", + { + "modules_to_save": ["lm_head"] + }, + "only supports modules_to_save being None", + ), +] + + +def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): + peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1, + max_position_embeddings=4096) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + peft_helper.validate_legal(lora_config) + assert peft_helper.r == 8 + assert peft_helper.lora_alpha == 16 + assert peft_helper.target_modules == [ + "q_proj", + "v_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ] + assert peft_helper.context_length == 16384 + assert peft_helper.vllm_max_position_embeddings == 4096 + assert peft_helper.vllm_long_context_scaling_factor == float( + math.ceil(peft_helper.context_length / + peft_helper.vllm_max_position_embeddings)) + # test RSLoRA + rslora_config = dict(use_rslora=True) + test_dir = tmp_path / "test_rslora" + shutil.copytree(long_context_lora_files_16k_1, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(rslora_config) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + + peft_helper = PEFTHelper.from_local_dir(test_dir, + max_position_embeddings=4096) + peft_helper.validate_legal(lora_config) + scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 + + +@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES) +def test_peft_helper_error( + sql_lora_files, + tmp_path, + test_name: str, + config_change: dict, + expected_error: str, +): + test_dir = tmp_path / test_name + shutil.copytree(sql_lora_files, test_dir) + + # Load and modify configuration + config_path = test_dir / "adapter_config.json" + with open(config_path) as f: + adapter_config = json.load(f) + # Apply configuration changes + adapter_config.update(config_change) + + # Save modified configuration + with open(config_path, "w") as f: + json.dump(adapter_config, f) + lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) + # Test loading the adapter + with pytest.raises(ValueError, match=expected_error): + PEFTHelper.from_local_dir( + test_dir, max_position_embeddings=4096).validate_legal(lora_config) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 0609fd96825e3..9c1f784c1c93b 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME, revision=REVISION, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + + vllm_model.apply_model(check_model) + # assert output assert output @@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME_ROBERTA, revision=REVISION_ROBERTA, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -84,11 +84,12 @@ def test_roberta_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + vllm_model.apply_model(check_model) # assert output assert output @@ -103,17 +104,18 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): model_name = "FacebookAI/roberta-base" with vllm_runner(model_name=model_name, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_tokenizer = model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.model.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert not hasattr(model, "lm_head") - assert isinstance(model, RobertaEmbeddingModel) - assert isinstance(model._pooler, CLSPool) + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert not hasattr(model, "lm_head") + assert isinstance(model._pooler, CLSPool) + + vllm_model.apply_model(check_model) assert output diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 057b04349e8b7..2e06b10fbb827 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,10 +33,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 06739e8f02253..1ad4f5aae8f5b 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,10 +51,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 4e110366a09f3..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -73,10 +73,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 7620ed1107e8f..ca572cc39e538 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -9,6 +9,7 @@ import pytest from transformers import AutoModelForVision2Seq +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform @@ -189,30 +190,27 @@ dtype="bfloat16", ), "deepseek_vl_v2": VLMTestInfo( - models=["deepseek-ai/deepseek-vl2-small"], + models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 max_model_len=4096, max_num_seqs=2, single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "\nWhat's the color of the stop sign and car?", - "cherry_blossom": "\nWhat's the color of the tower?", + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501 }), - multi_image_prompt="image_1:\nimage_2:\nDescribe the two images shortly.", # noqa: E501 + multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501 vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501 - image_size_factors=[(0.10, 0.15)], patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, postprocess_inputs=model_utils.cast_dtype_post_processor("images"), hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 - num_logprobs=5, + image_size_factors=[(), (1.0, ), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], marks=[ pytest.mark.skipif( - not is_flash_attn_2_available(), - reason="Model needs flash-attn for numeric convergence.", - ), - large_gpu_mark(min_gb=48), + TRANSFORMERS_VERSION >= "4.48.0", + reason="HF model is not compatible with transformers>=4.48.0", + ) ], ), "fuyu": VLMTestInfo( diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 16e256e040a74..5a485f3d81747 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -5,7 +5,6 @@ import torch from PIL import Image -from vllm.entrypoints.llm import LLM from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video @@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, @@ -105,17 +104,19 @@ def batch_make_image_embeddings( pixel_values = preprocess_result["pixel_values"] image_grid_thw = preprocess_result["image_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker. \ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - image_embeds = visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptImageEmbeddingInput] = [] @@ -124,11 +125,10 @@ def batch_make_image_embeddings( for image_batch in image_batches_: cur_batch_image_count = len(image_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in image_grid_thw[image_counter:image_counter + - cur_batch_image_count] - ]) + cur_batch_image_count]) result.append({ "image_embeds": @@ -151,7 +151,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. @@ -187,17 +187,19 @@ def batch_make_video_embeddings( pixel_values = preprocess_result["pixel_values_videos"] video_grid_thw = preprocess_result["video_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker.\ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - video_embeds = visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptVideoEmbeddingInput] = [] @@ -206,11 +208,10 @@ def batch_make_video_embeddings( for video_batch in video_batches_: cur_batch_video_count = len(video_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in video_grid_thw[video_counter:video_counter + - cur_batch_video_count] - ]) + cur_batch_video_count]) result.append({ "video_embeds": @@ -280,9 +281,9 @@ def run_embedding_input_test( max_tokens, num_logprobs=num_logprobs, images=batch_make_image_embeddings( - images, processor, vllm_model.model) if images else None, + images, processor, vllm_model) if images else None, videos=batch_make_video_embeddings( - videos, processor, vllm_model.model) if videos else None) + videos, processor, vllm_model) if videos else None) for prompts, images, videos in inputs ] diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6673a9fc22f69..0cbe4afe96c0a 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -24,10 +24,13 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) with hf_runner(model, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 04ab4dd7371a3..e17198e385475 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -17,14 +17,15 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), - # [Encoder-decoder] - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-decoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) @@ -61,10 +62,13 @@ def test_models( max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0a38779e0e4f0..1e3e7ea50b122 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -22,6 +22,8 @@ def _test_processing_correctness( ): if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} + elif model_id == "deepseek-ai/deepseek-vl2-tiny": + hf_overrides = {"architectures": ["DeepseekVLV2ForCausalLM"]} else: hf_overrides = {} @@ -139,6 +141,7 @@ def _test_processing_correctness( ("rhymes-ai/Aria", {"image": True}), ("Salesforce/blip2-opt-2.7b", {"image": False}), ("facebook/chameleon-7b", {"image": False}), + ("deepseek-ai/deepseek-vl2-tiny", {"image": True}), ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index 1eec35d9c3c72..6de649f87204d 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -13,6 +13,67 @@ from ...utils import build_model_context +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. " + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 2 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + def _validate_image_prompt_replacements_one( processor: BaseMultiModalProcessor, num_imgs: int, diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index 94ea604c58b43..806437d35ec87 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -13,6 +13,68 @@ from ...utils import build_model_context +def _validate_image_max_tokens_one( + processor: BaseMultiModalProcessor, + max_tokens: int, + failed_size_excs: list[tuple[ImageSize, Exception]], + image_size: ImageSize, +) -> None: + info = processor.info + feature_size = info.get_num_image_tokens(image_width=image_size.width, + image_height=image_size.height) + + try: + assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}" + except Exception as exc: + failed_size_excs.append((image_size, exc)) + + +@pytest.mark.skip("This test takes around 5 minutes to run. " + "Comment this out to run it manually.") +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +def test_processor_max_tokens(model_id): + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor( + ctx.model_config, + tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer), + ) + info = processor.info + + seen_aspect_ratios = set[float]() + image_sizes = list[ImageSize]() + + # The aspect ratio of the grid layout is between 1 and 6 + # NOTE: Assumes that feature size calculation is the same if we + # swap the width and height of the image + for w, h in itertools.product(range(32, 4096), repeat=2): + aspect_ratio = w / h + if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios: + image_sizes.append(ImageSize(w, h)) + seen_aspect_ratios.add(aspect_ratio) + + failed_size_excs = list[tuple[ImageSize, Exception]]() + + validate_one = partial( + _validate_image_max_tokens_one, + processor, + info.get_max_image_tokens(), # type: ignore + failed_size_excs, + ) + pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes") + + if failed_size_excs: + msg = "Found failing image sizes:" \ + + "\n========\n".join(f"[{size}]\n{exc}" + for size, exc in failed_size_excs) + raise AssertionError(msg) + + def _validate_image_prompt_replacements_one( processor: BaseMultiModalProcessor, num_imgs: int, diff --git a/tests/models/registry.py b/tests/models/registry.py index b0f0f9767a90f..9603ea8817cac 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -69,6 +69,7 @@ class _HfExamplesInfo: "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3", # noqa: E501 trust_remote_code=True), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), @@ -154,6 +155,7 @@ class _HfExamplesInfo: "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 @@ -181,8 +183,7 @@ class _HfExamplesInfo: trust_remote_code=True), "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", is_available_online=False), - # TODO(Isotr0py): Use deepseek-vl2-tiny for test after it's supported - "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-small"), # noqa: E501 + "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 92436889ecffe..0cd86cef0a475 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -30,50 +30,55 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj - - # assert zp for symmetric and asymmetric cases - def zp_valid(zp: Optional[torch.Tensor]): - if is_symmetric: - return zp is None - - return zp is not None and zp.dtype is torch.int32 - - assert zp_valid(qkv_proj.input_zero_point) - assert zp_valid(o_proj.input_zero_point) - assert zp_valid(gate_up_proj.input_zero_point) - assert zp_valid(down_proj.input_zero_point) - - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.is_static_input_scheme - expected_type = torch.int8 - - assert qkv_proj.weight.dtype is expected_type - assert o_proj.weight.dtype is expected_type - assert gate_up_proj.weight.dtype is expected_type - - if qkv_proj.scheme.strategy == "tensor": - # Make sure it is a channelwise buffer - # After running process_weights_after_loading - assert len(qkv_proj.weight_scale.shape) == 2 - assert qkv_proj.weight_scale.shape[0] == shape_0 - assert qkv_proj.weight_scale.shape[1] == 1 - assert qkv_proj.weight_scale.dtype is torch.float32 - assert qkv_proj.input_scale.dtype is torch.float32 + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + # assert zp for symmetric and asymmetric cases + def zp_valid(zp: Optional[torch.Tensor]): + if is_symmetric: + return zp is None + + return zp is not None and zp.dtype is torch.int32 + + assert zp_valid(qkv_proj.input_zero_point) + assert zp_valid(o_proj.input_zero_point) + assert zp_valid(gate_up_proj.input_zero_point) + assert zp_valid(down_proj.input_zero_point) + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.is_static_input_scheme + expected_type = torch.int8 + + assert qkv_proj.weight.dtype is expected_type + assert o_proj.weight.dtype is expected_type + assert gate_up_proj.weight.dtype is expected_type + + if qkv_proj.scheme.strategy == "tensor": + # Make sure it is a channelwise buffer + # After running process_weights_after_loading + assert len(qkv_proj.weight_scale.shape) == 2 + assert qkv_proj.weight_scale.shape[0] == shape_0 + assert qkv_proj.weight_scale.shape[1] == 1 + assert qkv_proj.weight_scale.dtype is torch.float32 + assert qkv_proj.input_scale.dtype is torch.float32 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -129,16 +134,20 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + assert not qkv_proj.scheme.is_static_input_scheme + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.weight.dtype is torch.int8 - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - assert not qkv_proj.scheme.is_static_input_scheme - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.weight.dtype is torch.int8 + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -152,19 +161,24 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) + def check_model(model): + layer = model.model.layers[0] - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.group_size == (-1 if group is None else group) + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) - assert qkv_proj.weight_packed.dtype is torch.int32 - assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.scheme.pack_factor == pack_factor + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.group_size == (-1 + if group is None else group) + + assert qkv_proj.weight_packed.dtype is torch.int32 + assert qkv_proj.weight_scale.dtype is torch.float16 + assert qkv_proj.scheme.pack_factor == pack_factor + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -173,14 +187,18 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_w4a16_marlin24(vllm_runner): model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) - assert qkv_proj.weight_packed.dtype is torch.int32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -189,23 +207,27 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_fp8(vllm_runner): model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance( - qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.input_scale.dtype is torch.float32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) - if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - assert qkv_proj.weight_scale.dtype is torch.float32 - assert len(qkv_proj.weight_scale.shape) == 0 + assert qkv_proj.input_scale.dtype is torch.float32 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -248,12 +270,15 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -273,12 +298,15 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.int8 - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -293,20 +321,24 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensors24) - - assert qkv_proj.scheme.weight_quant is None - assert qkv_proj.scheme.input_quant is None - assert not qkv_proj.scheme.quantized - assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" - assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a0c1d7e24c503..4bff734746297 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,13 +49,17 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - attn = model.model.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - # NOTE: it is valid for scales to be 1.0 (default value), but we know - # these checkpoints have scales < 1.0 - assert 0.0 < attn._k_scale < 1.0 - assert 0.0 < attn._v_scale < 1.0 + def check_model(model): + attn = model.model.layers[0].self_attn.attn + + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + + llm.apply_model(check_model) # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -77,22 +81,24 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.quant_method, Fp8LinearMethod) - if kv_cache_dtype == "fp8": - attn = model.model.decoder.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - assert attn._k_scale == 1.0 - assert attn._v_scale == 1.0 - - if current_platform.has_device_capability(89) and not force_marlin: - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fn - else: - # For GPUs without hardware support, we pack the fp8 weights - # for weight-only quantization using Marlin kernels - assert fc1.weight.dtype == torch.int32 + def check_model(model): + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 + + if current_platform.has_device_capability(89) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fn + else: + # For GPUs without hardware support, we pack the fp8 weights + # for weight-only quantization using Marlin kernels + assert fc1.weight.dtype == torch.int32 + + llm.apply_model(check_model) @pytest.mark.skipif(not is_quant_method_supported("fp8"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ad526a4065101..fa2d9645ea47f 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -28,20 +28,23 @@ def test_lm_head( model_lm_head_quant: Tuple[str, bool], ) -> None: model, lm_head_quantized = model_lm_head_quant - vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) - - lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model.lm_head) - - if lm_head_quantized: - assert isinstance( - lm_head_layer.linear_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) - else: - assert isinstance(lm_head_layer.linear_method, - UnquantizedEmbeddingMethod) - - print( - vllm_model.generate_greedy(prompts=["Hello my name is"], - max_tokens=10)[0][1]) - del vllm_model + + with vllm_runner(model, dtype=torch.float16, + max_model_len=2048) as vllm_model: + + def check_model(model): + lm_head_layer = model.lm_head + + if lm_head_quantized: + assert isinstance(lm_head_layer.linear_method, + (GPTQLinearMethod, GPTQMarlinLinearMethod, + MarlinLinearMethod)) + else: + assert isinstance(lm_head_layer.linear_method, + UnquantizedEmbeddingMethod) + + vllm_model.apply_model(check_model) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 27493a682b746..11382ad708faa 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -12,19 +12,22 @@ def test_quark_fp8(vllm_runner): model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + qkv_proj = layer.self_attn.qkv_proj - if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz - assert len(qkv_proj.weight_scale.shape) == 0 + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + + if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py new file mode 100644 index 0000000000000..8e7f44a399ddf --- /dev/null +++ b/tests/quantization/test_register_quantization_config.py @@ -0,0 +1,117 @@ +"""Tests register custom quantization config. + +See https://github.com/vllm-project/vllm/issues/11926 for more details. + +Run `pytest tests/quantization/test_register_quantization_config.py`. +""" +from typing import Any, Dict, List, Optional + +import pytest +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.linear import LinearBase # noqa: E501 +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import ( + get_quantization_config, register_quantization_config) +from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig) + + +class FakeQuantLinearMethod(UnquantizedLinearMethod): + """Fake quantization linear method for per-token dynamic quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization method.""" + super().__init__() + self.num_bits = num_bits + + def apply(self, + layer: "torch.nn.Module", + x: "torch.Tensor", + bias: Optional["torch.Tensor"] = None) -> "torch.Tensor": + """Perform fake quantization before the linear layer.""" + + # Calculate the scales dynamically + max_val = torch.amax(x, dim=(0, -1), keepdims=True) + min_val = torch.amin(x, dim=(0, -1), keepdims=True) + scales = (max_val - min_val) / (2**self.num_bits - 1) + + # Fake quantize the input + quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1), + 2**(self.num_bits - 1) - 1) + dequant_x = quant_x * scales + + return F.linear(dequant_x, layer.weight, bias) + + +@register_quantization_config("custom_quant") +class CustomQuantConfig(QuantizationConfig): + """Custom quantization config for per-token dynamic fake quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization config.""" + self.num_bits = num_bits + + def get_name(self) -> str: + """Name of the quantization method.""" + return "custom_quant" + + def get_supported_act_dtypes(self) -> List["torch.dtype"]: + """List of supported activation dtypes.""" + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method.""" + return -1 + + @staticmethod + def get_config_filenames() -> List[str]: + """List of filenames to search for in the model directory.""" + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig": + """Create a config class from the model's quantization config.""" + return CustomQuantConfig(num_bits=config.get("num_bits", 8)) + + def get_quant_method(self, layer: "torch.nn.Module", + prefix: str) -> Optional["FakeQuantLinearMethod"]: + """Get the quantize method to use for the quantized layer.""" + if isinstance(layer, LinearBase): + return FakeQuantLinearMethod(num_bits=self.num_bits) + return None + + +def test_register_quantization_config(): + """Test register custom quantization config.""" + + # The quantization method `custom_quant` should be registered. + assert get_quantization_config("custom_quant") == CustomQuantConfig + + # The quantization method `custom_quant` is already exists, + # should raise an error. + with pytest.raises(ValueError): + register_quantization_config("custom_quant")(CustomQuantConfig) + + +@pytest.mark.parametrize(argnames="model", + argvalues=[ + "meta-llama/Meta-Llama-3-8B-Instruct", + ]) +def test_custom_quant(vllm_runner, model): + """Test infer with the custom quantization method.""" + with vllm_runner(model_name=model, + quantization="custom_quant", + enforce_eager=True) as llm: + + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj + + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index bf409d2d97aa1..6e7eec1c6ab34 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -3,6 +3,7 @@ import os import pathlib import subprocess +from functools import partial from unittest.mock import MagicMock, patch import openai @@ -24,7 +25,6 @@ # yapf: enable from vllm.utils import PlaceholderModule, import_from_path -from ..conftest import VllmRunner from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip @@ -58,16 +58,6 @@ def is_curl_installed(): return False -def get_torch_model(vllm_runner: VllmRunner): - return vllm_runner \ - .model \ - .llm_engine \ - .model_executor \ - .driver_worker \ - .model_runner \ - .model - - def write_keyfile(keyfile_path: str): encryption_params = EncryptionParams.random() pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True) @@ -121,8 +111,10 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - serialize_vllm_model(get_torch_model(vllm_model), - config_for_serializing) + + vllm_model.apply_model( + partial(serialize_vllm_model, + tensorizer_config=config_for_serializing)) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) @@ -175,8 +167,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) with vllm_runner( model_ref, @@ -215,8 +209,10 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) model_loader_extra_config = { "tensorizer_uri": str(model_path), @@ -337,7 +333,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(get_torch_model(vllm_model), config) + + vllm_model.apply_model( + partial(serialize_vllm_model, tensorizer_config=config)) assert is_vllm_tensorized(config) diff --git a/tests/test_utils.py b/tests/test_utils.py index c68d730af7f8a..d5dc4464e634d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,10 +9,10 @@ from vllm_test_utils import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.utils import (FlexibleArgumentParser, PlaceholderModule, - StoreBoolean, bind_kv_cache, deprecate_kwargs, - get_open_port, memory_profiling, merge_async_iterators, - supports_kw) +from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, + PlaceholderModule, StoreBoolean, bind_kv_cache, + deprecate_kwargs, get_open_port, memory_profiling, + merge_async_iterators, supports_kw) from .utils import error_on_warning, fork_new_process_for_each_test @@ -284,14 +284,13 @@ def test_memory_profiling(): # 512 MiB allocation outside of this instance handle1 = lib.cudaMalloc(512 * 1024 * 1024) - baseline_memory_in_bytes = \ - torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0] + baseline_snapshot = MemorySnapshot() # load weights weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32) - weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB + weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB def measure_current_non_torch(): free, total = torch.cuda.mem_get_info() @@ -300,8 +299,8 @@ def measure_current_non_torch(): current_non_torch = current_used - current_torch return current_non_torch - with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, - weights_memory_in_bytes=weights_memory_in_bytes) as result, \ + with memory_profiling(baseline_snapshot=baseline_snapshot, + weights_memory=weights_memory) as result, \ monitor(measure_current_non_torch) as monitored_values: # make a memory spike, 1 GiB spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) @@ -316,13 +315,12 @@ def measure_current_non_torch(): assert measured_diff == 256 * 1024 * 1024 # Check that the memory usage is within 5% of the expected values - # 5% tolerance is caused by PyTorch caching allocator, - # we cannot control PyTorch's behavior of its internal buffers, + # 5% tolerance is caused by cuda runtime. + # we cannot control cuda runtime in the granularity of bytes, # which causes a small error (<10 MiB in practice) - non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa - torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa + non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa assert abs(non_torch_ratio - 1) <= 0.05 - assert abs(torch_peak_ratio - 1) <= 0.05 + assert result.torch_peak_increase == 1024 * 1024 * 1024 del weights lib.cudaFree(handle1) lib.cudaFree(handle2) diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py new file mode 100644 index 0000000000000..ac773b611f406 --- /dev/null +++ b/tests/v1/test_utils.py @@ -0,0 +1,62 @@ +from typing import List + +import torch + +from vllm.v1.utils import bind_kv_cache + + +def test_bind_kv_cache(): + from vllm.attention import Attention + + ctx = { + 'layers.0.self_attn': Attention(32, 128, 0.1), + 'layers.1.self_attn': Attention(32, 128, 0.1), + 'layers.2.self_attn': Attention(32, 128, 0.1), + 'layers.3.self_attn': Attention(32, 128, 0.1), + } + kv_cache = { + 'layers.0.self_attn': torch.zeros((1, )), + 'layers.1.self_attn': torch.zeros((1, )), + 'layers.2.self_attn': torch.zeros((1, )), + 'layers.3.self_attn': torch.zeros((1, )), + } + runner_kv_caches: List[torch.Tensor] = [] + bind_kv_cache(kv_cache, ctx, runner_kv_caches) + assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[ + 'layers.0.self_attn'] + assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[ + 'layers.1.self_attn'] + assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[ + 'layers.2.self_attn'] + assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[ + 'layers.3.self_attn'] + + assert runner_kv_caches[0] is kv_cache['layers.0.self_attn'] + assert runner_kv_caches[1] is kv_cache['layers.1.self_attn'] + assert runner_kv_caches[2] is kv_cache['layers.2.self_attn'] + assert runner_kv_caches[3] is kv_cache['layers.3.self_attn'] + + +def test_bind_kv_cache_non_attention(): + from vllm.attention import Attention + + # example from Jamba PP=2 + ctx = { + 'model.layers.20.attn': Attention(32, 128, 0.1), + 'model.layers.28.attn': Attention(32, 128, 0.1), + } + kv_cache = { + 'model.layers.20.attn': torch.zeros((1, )), + 'model.layers.28.attn': torch.zeros((1, )), + } + + runner_kv_caches: List[torch.Tensor] = [] + bind_kv_cache(kv_cache, ctx, runner_kv_caches) + + assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[ + 'model.layers.20.attn'] + assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[ + 'model.layers.28.attn'] + + assert runner_kv_caches[0] is kv_cache['model.layers.20.attn'] + assert runner_kv_caches[1] is kv_cache['model.layers.28.attn'] diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index a06956ce18a93..272206d4502e9 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main qqq, HandH1998/QQQ-Llama-3-8b-g128, main qqq, HandH1998/QQQ-Llama-3-8b, main -hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main \ No newline at end of file +hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main +None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 199731bdc21fe..7a3786456d0d6 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -20,12 +20,13 @@ def test_weight_loading(vllm_runner): """ Test parameter weight loading with tp>1. """ - with vllm_runner(model_name=MODEL_NAME, - revision=REVISION, - dtype=torch.half if QUANTIZATION == "gptq" else "auto", - quantization=QUANTIZATION, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=2) as model: + with vllm_runner( + model_name=MODEL_NAME, + revision=REVISION, + dtype=torch.half if QUANTIZATION == "gptq" else "auto", + quantization=None if QUANTIZATION == "None" else QUANTIZATION, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=2) as model: output = model.generate_greedy("Hello world!", max_tokens=20) print(output) diff --git a/tools/actionlint.sh b/tools/actionlint.sh deleted file mode 100755 index f6a8b5e83a2de..0000000000000 --- a/tools/actionlint.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if command -v actionlint &> /dev/null; then - actionlint "$@" - exit 0 -elif [ -x ./actionlint ]; then - ./actionlint "$@" - exit 0 -fi - -# download a binary to the current directory - v1.7.3 -bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) -./actionlint "$@" diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh deleted file mode 100755 index 19a55ddfa91c4..0000000000000 --- a/tools/doc-lint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -pymarkdownlnt scan docs -r diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index d99fa77b96351..7efb3cabc64fe 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -19,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' +find . -name "*.sh" ".git" -prune -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' diff --git a/vllm/__init__.py b/vllm/__init__.py index 45252b93e3d54..a533dba561c00 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -17,6 +17,44 @@ from .version import __version__, __version_tuple__ + +def configure_as_vllm_process(): + """ + set some common config/environment variables that should be set + for all processes created by vllm and all processes + that interact with vllm workers. + """ + import os + + import torch + + # see https://github.com/NVIDIA/nccl/issues/1234 + os.environ['NCCL_CUMEM_ENABLE'] = '0' + + # see https://github.com/vllm-project/vllm/issues/10480 + os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' + # see https://github.com/vllm-project/vllm/issues/10619 + torch._inductor.config.compile_threads = 1 + + from vllm.platforms import current_platform + + if current_platform.is_xpu(): + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True + elif current_platform.is_hpu(): + # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) + # does not support torch.compile + # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for + # torch.compile support + is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' + if is_lazy: + torch._dynamo.config.disable = True + # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) + # requires enabling lazy collectives + # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 + os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' + + __all__ = [ "__version__", "__version_tuple__", @@ -42,4 +80,5 @@ "AsyncEngineArgs", "initialize_ray_cluster", "PoolingParams", + "configure_as_vllm_process", ] diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 9b03fd73fe690..e2403306950a3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -101,7 +101,9 @@ def __init__( self.num_heads = num_heads self.head_size = head_size self.num_kv_heads = num_kv_heads + self.sliding_window = sliding_window self.backend = backend_name_to_enum(attn_backend.get_name()) + self.dtype = dtype # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 87655530cead4..955c25f300512 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -25,23 +25,30 @@ logger = init_logger(__name__) +@dataclasses.dataclass +class InductorArtifact: + hash_str: str = "" + file_path: str = "" + + class InductorHashCache: """ Disk format: a Python list of tuples, each tuple is - (runtime_shape, graph_index, hash_str) + (runtime_shape, graph_index, hash_str, file_path) We use list of tuple for readability. In-memory format: a defaultdict of dict, where the key is runtime_shape, and the value is a dict of graph_index to hash_str. - The data is essentially `Dict[Optional[int], Dict[int, str]]`, + The data is essentially `Dict[Optional[int], Dict[int, InductorArtifact]]`, we don't use json here because json doesn't support int as key. TODO: better off-the-shelf solution to serialize the data? """ def __init__(self, cache_dir: str, disabled: bool = False): - self.cache: defaultdict = defaultdict(dict) + self.cache: Dict[Optional[int], + Dict[int, InductorArtifact]] = defaultdict(dict) self.disabled = disabled self.cache_dir = cache_dir self.cache_file_path = os.path.join(cache_dir, @@ -66,14 +73,25 @@ def deserialize(self, data: str): # because it is a safe way to parse Python literals. # do not use eval(), it is unsafe. list_data = ast.literal_eval(data) - for runtime_shape, graph_index, hash_str in list_data: - self.cache[runtime_shape][graph_index] = hash_str + for item in list_data: + runtime_shape = item[0] + graph_index = item[1] + hash_str = item[2] + # for compatibility of old version, + # where we don't have file_path. + # NOTE: after running the new code, the file_path + # will be updated. + file_path = "" if len(item) == 3 else item[3] + self.cache[runtime_shape][graph_index] = InductorArtifact( + hash_str=hash_str, file_path=file_path) def serialize(self) -> str: data = [] - for runtime_shape, graph_index_to_hash_str in self.cache.items(): - for graph_index, hash_str in graph_index_to_hash_str.items(): - data.append((runtime_shape, graph_index, hash_str)) + for runtime_shape, value in self.cache.items(): + for graph_index, inductor_artifact in value.items(): + data.append( + (runtime_shape, graph_index, inductor_artifact.hash_str, + inductor_artifact.file_path)) printer = pprint.PrettyPrinter(indent=4) return printer.pformat(data) @@ -90,13 +108,14 @@ def __contains__(self, key: Tuple[Optional[int], int]) -> bool: return runtime_shape in self.cache and graph_index in self.cache[ runtime_shape] - def __getitem__(self, key: Tuple[Optional[int], int]) -> str: + def __getitem__(self, key: Tuple[Optional[int], int]) -> InductorArtifact: if self.disabled: raise KeyError("cannot read from disabled cache") runtime_shape, graph_index = key return self.cache[runtime_shape][graph_index] - def __setitem__(self, key: Tuple[Optional[int], int], value: str): + def __setitem__(self, key: Tuple[Optional[int], int], + value: InductorArtifact): # setitem for disabled cache is fine, because we # don't actually write to the disk runtime_shape, graph_index = key @@ -181,7 +200,8 @@ def wrap_inductor(graph: fx.GraphModule, if (runtime_shape, graph_index) in cache_data: # we compiled this graph before # so we can directly lookup the compiled graph via hash - hash_str = cache_data[(runtime_shape, graph_index)] + inductor_artifact = cache_data[(runtime_shape, graph_index)] + hash_str = inductor_artifact.hash_str if graph_index == 0: # adds some info logging for the first graph logger.info( @@ -199,6 +219,7 @@ def wrap_inductor(graph: fx.GraphModule, "Inductor cache lookup failed. Please remove" f"the cache file {cache_data.cache_file_path} and try again." # noqa ) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa # Inductor calling convention (function signature): # f(list) -> tuple @@ -224,19 +245,20 @@ def compiled_graph(*args): # the assumption is that we don't have nested Inductor compilation. # compiled_fx_graph_hash will only be called once, and we can hook # it to get the hash of the compiled graph directly. - from torch._inductor.codecache import compiled_fx_graph_hash + + inductor_artifact = InductorArtifact() + from torch._inductor.codecache import (FxGraphCache, + compiled_fx_graph_hash) + original_load = FxGraphCache.load + + def hijack_load(*args, **kwargs): + inductor_compiled_graph = original_load(*args, **kwargs) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + return inductor_compiled_graph def hijack_compiled_fx_graph_hash(*args, **kwargs): out = compiled_fx_graph_hash(*args, **kwargs) - # store the hash in the cache - nonlocal cache_data - cache_data[(runtime_shape, graph_index)] = out[0] - if graph_index == 0: - # adds some info logging for the first graph - logger.info("Cache the graph of shape %s for later use", - str(runtime_shape)) - logger.debug("store the %s-th graph for shape %s via hash %s", - graph_index, str(runtime_shape), out[0]) + inductor_artifact.hash_str = out[0] return out def _check_can_cache(*args, **kwargs): @@ -251,19 +273,45 @@ def _check_can_cache(*args, **kwargs): def _get_shape_env() -> AlwaysHitShapeEnv: return AlwaysHitShapeEnv() - with patch(# for hijacking the hash of the compiled graph - "torch._inductor.codecache.compiled_fx_graph_hash", - hijack_compiled_fx_graph_hash), \ - patch(# for providing a dummy shape environment - "torch._inductor.codecache.FxGraphCache._get_shape_env", - _get_shape_env), \ - patch(# for forcing the graph to be cached - "torch._inductor.codecache.FxGraphCache._check_can_cache", - _check_can_cache): + with ExitStack() as stack: + if not cache_data.disabled: + # compilation cache is enabled, patch several functions + + # hijack to get the compiled graph itself + stack.enter_context( + patch("torch._inductor.codecache.FxGraphCache.load", + hijack_load)) + + # for hijacking the hash of the compiled graph + stack.enter_context( + patch("torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash)) + + # for providing a dummy shape environment + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env)) + + # for forcing the graph to be cached + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache)) + compiled_graph = compile_fx(graph, example_inputs, config_patches=current_config) - + # store the inductor_artifact in the cache + cache_data[(runtime_shape, graph_index)] = inductor_artifact + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Cache the graph of shape %s for later use", + str(runtime_shape)) + logger.debug( + "store the %s-th graph for shape %s via hash %s from file %s", + graph_index, str(runtime_shape), inductor_artifact.hash_str, + inductor_artifact.file_path) # after compiling the last graph, record the end time if graph_index == num_graphs - 1: now = time.time() @@ -576,9 +624,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: ] # index of tensors that have symbolic shapes (batch size) + # for weights and static buffers, they will have concrete shapes. + # symbolic shape only happens for input tensors. + from torch.fx.experimental.symbolic_shapes import is_symbolic self.sym_tensor_indices = [ i for i, x in enumerate(fake_args) - if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \ + any(is_symbolic(d) for d in x.size()) ] # compiler managed cudagraph input buffers diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 10513111ea7f1..38f284794b8db 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -76,8 +76,8 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): During runtime, when we actually mark dimensions of tensors, it depends on the value of arguments: - - if it is a single integer, the corresponding dimension of the argument - will be marked as dynamic. + - if it is a single integer (can be negative), the corresponding dimension + of the argument will be marked as dynamic. - if it is `None`, ignored. - if it is `IntermediateTensors`, all the tensors in the intermediate tensors will be marked as dynamic. @@ -177,10 +177,20 @@ def __call__(self, *args, **kwargs): for k, dims in dynamic_arg_dims.items(): arg = bound_args.arguments.get(k) if arg is not None: + dims = [dims] if isinstance(dims, int) else dims if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [ + arg.ndim + dim if dim < 0 else dim for dim in dims + ] torch._dynamo.mark_dynamic(arg, dims) elif isinstance(arg, IntermediateTensors): for tensor in arg.tensors.values(): + # In case dims is specified with negative indexing + dims = [ + tensor.ndim + dim if dim < 0 else dim + for dim in dims + ] torch._dynamo.mark_dynamic(tensor, dims) else: raise ValueError( diff --git a/vllm/config.py b/vllm/config.py index 2fe674b857e16..4698a05020332 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -607,10 +607,12 @@ def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) - if (self.hf_config.model_type == 'deepseek_v3' + MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama'] + if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH and not self.enforce_eager): - logger.warning("CUDA graph is not supported for Deepseek V3 yet, " - "fallback to the eager mode.") + logger.warning( + "CUDA graph is not supported for %s yet, fallback to the eager " + "mode.", self.hf_config.model_type) self.enforce_eager = True def _verify_bnb_config(self) -> None: @@ -1338,14 +1340,15 @@ def _verify_args(self) -> None: from vllm.executor.executor_base import ExecutorBase from vllm.platforms import current_platform if self.distributed_executor_backend not in ( - "ray", "mp", "uni", None) and not (isinstance( + "ray", "mp", "uni", + "external_launcher", None) and not (isinstance( self.distributed_executor_backend, type) and issubclass( self.distributed_executor_backend, ExecutorBase)): raise ValueError( "Unrecognized distributed executor backend " f"{self.distributed_executor_backend}. Supported " - "values are 'ray', 'mp' 'uni', or custom ExecutorBase" - " subclass.") + "values are 'ray', 'mp' 'uni', 'external_launcher' or" + " custom ExecutorBase subclass.") if self.use_ray: from vllm.executor import ray_utils ray_utils.assert_ray_available() @@ -2859,17 +2862,8 @@ def model_post_init(self, __context: Any) -> None: "vllm.unified_attention_with_output", ] else: - # v0 can use full graph compilation without splitting, - # splitting is optional. - # right now we still need it. kv cache shape - # will be included in the graph if we don't split - # the graph. - # TODO: hide kv cache in static forward context - # so that inductor does not see it. - self.splitting_ops = [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - ] + # v0 uses full graph compilation + self.splitting_ops = [] for k, v in self.inductor_passes.items(): if not isinstance(v, str): @@ -3171,7 +3165,8 @@ def __post_init__(self): if self.compilation_config is None: self.compilation_config = CompilationConfig() - if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: + if envs.VLLM_USE_V1 and self.model_config is not None and \ + not self.model_config.enforce_eager: # NOTE(woosuk): Currently, we use inductor because the piecewise # CUDA graphs do not work properly with the custom CUDA kernels. # FIXME(woosuk): Disable inductor to reduce the compilation time diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 4ace03ff1184e..7780e2dfa317d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -35,6 +35,7 @@ def __init__( ): self.config = config.kv_transfer_config + self.tp_size = config.parallel_config.tensor_parallel_size if self.config.kv_connector == "PyNcclConnector": from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import ( @@ -161,7 +162,7 @@ def send_kv_caches_and_hidden_states( end_layer = model_executable.model.end_layer model_config = model_executable.model.config - num_heads = model_config.num_key_value_heads + num_heads = int(model_config.num_key_value_heads / self.tp_size) hidden_size = model_config.hidden_size num_attention_heads = model_config.num_attention_heads head_size = int(hidden_size / num_attention_heads) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 03a8959a7d9ff..a4f4c9558d056 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -388,7 +388,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # Parallel arguments parser.add_argument( '--distributed-executor-backend', - choices=['ray', 'mp'], + choices=['ray', 'mp', 'uni', 'external_launcher'], default=EngineArgs.distributed_executor_backend, help='Backend to use for distributed model ' 'workers, either "ray" or "mp" (multiprocessing). If the product ' diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 49a1e9f505d9f..6a6b4a14a4c49 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -457,6 +457,11 @@ def _get_executor_cls(cls, # JAX-style, single-process, multi-device executor. from vllm.executor.uniproc_executor import UniProcExecutor executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # executor with external launcher + from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher) + executor_class = ExecutorWithExternalLauncher else: from vllm.executor.uniproc_executor import UniProcExecutor executor_class = UniProcExecutor @@ -684,7 +689,9 @@ def add_request( :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + lora_request: The LoRA request to add. trace_headers: OpenTelemetry trace headers. + prompt_adapter_request: The prompt adapter request to add. priority: The priority of the request. Only applicable with priority scheduling. diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 8f231de912c95..3aa9d30549f36 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -296,6 +296,7 @@ def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest): is_engine_errored=False, exception=e) self._send_outputs(rpc_err) + return # Otherwise, send back the successful load message self._send_outputs( RPCAdapterLoadedResponse(request_id=request.request_id)) diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index da3185f33dbe9..55c56abea0da3 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -102,9 +102,9 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, Args: seq_group: the output is associated with this :class:`SequenceGroup` - output: the :class:`SequenceGroupOutput` for a single scheduler step + outputs: the :class:`SequenceGroupOutput` for a single scheduler step """ - assert len(outputs) == 1, ("Single step should only has 1 output.") + assert len(outputs) == 1, "Single step should only have 1 output." output = outputs[0] assert isinstance(output, CompletionSequenceGroupOutput) single_step_process_prompt_logprob(self, seq_group, output) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index acb4db85632a8..27386daa4bbc9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,11 +1,13 @@ import itertools import warnings from contextlib import contextmanager -from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, - Union, cast, overload) +from typing import (Any, Callable, ClassVar, Dict, List, Optional, Sequence, + Tuple, Type, Union, cast, overload) +import cloudpickle +import torch.nn as nn from tqdm import tqdm -from typing_extensions import deprecated +from typing_extensions import TypeVar, deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -41,6 +43,8 @@ logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -186,6 +190,13 @@ def __init__( if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True + if "worker_cls" in kwargs: + worker_cls = kwargs["worker_cls"] + # if the worker_cls is not qualified string name, + # we serialize it using cloudpickle to avoid pickling issues + if isinstance(worker_cls, type): + kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + if compilation_config is not None: if isinstance(compilation_config, (int, dict)): compilation_config_instance = CompilationConfig.from_cli( @@ -455,6 +466,44 @@ def generate( outputs = self._run_engine(use_tqdm=use_tqdm) return self.engine_class.validate_outputs(outputs, RequestOutput) + def collective_rpc(self, + method: Union[str, Callable[..., _R]], + timeout: Optional[float] = None, + args: Tuple = (), + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + """ + executor = self.llm_engine.model_executor + return executor.collective_rpc(method, timeout, args, kwargs) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. + """ + executor = self.llm_engine.model_executor + return executor.apply_model(func) + def beam_search( self, prompts: List[Union[TokensPrompt, TextPrompt]], diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 88859255f202a..3da447be06430 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -203,15 +203,19 @@ def _validate_input( ) -> TextTokensPrompt: token_num = len(input_ids) - # Note: EmbeddingRequest doesn't have max_tokens - if isinstance(request, - (EmbeddingChatRequest, EmbeddingCompletionRequest)): + # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens + if isinstance( + request, + (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest)): + + operation = "score" if isinstance(request, ScoreRequest) \ + else "embedding generation" if token_num > self.max_model_len: raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " - f"{token_num} tokens in the input for embedding " - f"generation. Please reduce the length of the input.") + f"{token_num} tokens in the input for {operation}. " + f"Please reduce the length of the input.") return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index a222eafadcb68..fc422f0917bd5 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -157,24 +157,16 @@ async def load_lora_adapter( # This will also pre-load it for incoming requests try: await self.engine_client.add_lora(lora_request) - except ValueError as e: - # Adapter not found or lora configuration errors - if "No adapter found" in str(e): - return create_error_response(message=str(e), - err_type="NotFoundError", - status_code=HTTPStatus.NOT_FOUND) - else: - return create_error_response( - message=str(e), - err_type="BadRequestError", - status_code=HTTPStatus.BAD_REQUEST) except BaseException as e: - # Some other unexpected problem loading the adapter, e.g. malformed - # input files. - # More detailed error messages for the user would be nicer here + error_type = "BadRequestError" + status_code = HTTPStatus.BAD_REQUEST + if isinstance(e, ValueError) and "No adapter found" in str(e): + error_type = "NotFoundError" + status_code = HTTPStatus.NOT_FOUND + return create_error_response(message=str(e), - err_type="BadRequestError", - status_code=HTTPStatus.BAD_REQUEST) + err_type=error_type, + status_code=status_code) self.lora_requests.append(lora_request) logger.info("Loaded new LoRA adapter: name '%s', path '%s'", lora_name, diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 5d3e7139d7a17..381edf8fac49e 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,6 +101,38 @@ async def create_score( if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." + f" Please, select a smaller truncation size.") + + input_pairs = make_pairs(request.text_1, request.text_2) + for q, t in input_pairs: + request_prompt = f"{q}{tokenizer.sep_token}{t}" + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=q, + text_pair=t, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -108,28 +140,6 @@ async def create_score( # Schedule the request and get the result generator. generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] - input_pairs = make_pairs(request.text_1, request.text_2) - - for q, t in input_pairs: - request_prompt = f"{q}{tokenizer.sep_token}{t}" - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - tokenize_async = make_async(tokenizer.__call__, - executor=self._tokenizer_executor) - prompt_inputs = await tokenize_async(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - - request_prompts.append(request_prompt) - engine_prompts.append(engine_prompt) - try: pooling_params = request.to_pooling_params() diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 00ecadcf92667..859e105f15d97 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,6 +1,10 @@ import asyncio from abc import ABC, abstractmethod -from typing import Any, Awaitable, Dict, List, Optional, Set, Tuple, Union +from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, + Union) + +import torch.nn as nn +from typing_extensions import TypeVar from vllm.config import VllmConfig from vllm.logger import init_logger @@ -10,9 +14,12 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class ExecutorBase(ABC): """Base class for all executors. @@ -43,22 +50,37 @@ def __init__( @abstractmethod def _init_executor(self) -> None: - pass + raise NotImplementedError @abstractmethod def collective_rpc(self, - method: str, + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: """ - The main interface of the executor to run a method on all workers, - with homogeneous arguments. - If the args are heterogeneous, then we can pack them into a list, - and unpack them in the method of every worker, because every worker - knows their own rank. + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. """ - pass + raise NotImplementedError def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and @@ -78,16 +100,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: b = min([r[1] for r in results]) return a, b - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - For V1 compatibility. - """ - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """Initialize the KV cache by invoking the underlying worker. """ @@ -106,6 +118,17 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. + """ + + def rpc_func(worker: WorkerBase) -> _R: + return func(worker.get_model()) + + return self.collective_rpc(rpc_func) + def execute_model( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: @@ -260,7 +283,7 @@ def _driver_execute_model( raise NotImplementedError def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: @@ -269,7 +292,7 @@ def collective_rpc(self, @abstractmethod def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index d9dde949b844a..78c86321d861d 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,5 +1,8 @@ import asyncio -from typing import Any, List, Optional +import os +from typing import Any, Callable, List, Optional, Union + +import cloudpickle from vllm.executor.executor_base import DistributedExecutorBase from vllm.executor.multiproc_worker_utils import ( @@ -8,8 +11,9 @@ from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, make_async) +from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, + get_distributed_init_method, get_ip, get_open_port, + make_async, run_method, update_environment_variables) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -20,7 +24,39 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase): uses_ray: bool = False + def _check_cuda(self) -> None: + """Check that the number of GPUs is sufficient for the parallel + configuration. Separate from _init_executor to reduce the number of + indented blocks. + """ + parallel_config = self.parallel_config + world_size = parallel_config.world_size + tensor_parallel_size = parallel_config.tensor_parallel_size + + cuda_device_count = cuda_device_count_stateless() + # Use confusing message for more common TP-only case. + if tensor_parallel_size > cuda_device_count: + raise RuntimeError( + f"please set tensor_parallel_size ({tensor_parallel_size}) " + f"to less than max local gpu count ({cuda_device_count})") + + if world_size > cuda_device_count: + raise RuntimeError( + f"please ensure that world_size ({world_size}) " + f"is less than than max local gpu count ({cuda_device_count})") + + # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers + if "CUDA_VISIBLE_DEVICES" not in os.environ: + update_environment_variables({ + "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) + }) + def _init_executor(self) -> None: + + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + self._check_cuda() + # Create the parallel GPU workers. world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -107,12 +143,12 @@ def _driver_execute_model( def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, - ) -> Any: + ) -> List[Any]: """Runs the given method on all workers. Args: @@ -121,6 +157,11 @@ def _run_workers( It will also be run asynchronously and return a list of futures rather than blocking on the results. """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method if max_concurrent_workers: raise NotImplementedError( @@ -129,18 +170,18 @@ def _run_workers( if async_run_tensor_parallel_workers_only: # Run only non-driver workers and just return futures. return [ - worker.execute_method(method, *args, **kwargs) + worker.execute_method(sent_method, *args, **kwargs) for worker in self.non_driver_workers ] # Start all remote workers first. worker_outputs = [ - worker.execute_method(method, *args, **kwargs) + worker.execute_method(sent_method, *args, **kwargs) for worker in self.workers ] - driver_worker_method = getattr(self.driver_worker, method) - driver_worker_output = driver_worker_method(*args, **kwargs) + driver_worker_output = run_method(self.driver_worker, sent_method, + args, kwargs) # Get the results of the workers. return [driver_worker_output diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index c9fb3c664c575..539b6ae2d3572 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -15,7 +15,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.triton_utils.importing import HAS_TRITON -from vllm.utils import _check_multiproc_method, get_mp_context +from vllm.utils import _check_multiproc_method, get_mp_context, run_method if HAS_TRITON: from vllm.triton_utils import maybe_set_triton_cache_manager @@ -169,7 +169,7 @@ def __init__(self, result_handler: ResultHandler, self.process.start() def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], - method: str, args, kwargs): + method: Union[str, bytes], args, kwargs): task_id = uuid.uuid4() self.tasks[task_id] = future try: @@ -180,12 +180,13 @@ def _enqueue_task(self, future: Union[ResultFuture, asyncio.Future], del self.tasks[task_id] raise ChildProcessError("worker died") from e - def execute_method(self, method: str, *args, **kwargs): + def execute_method(self, method: Union[str, bytes], *args, **kwargs): future: ResultFuture = ResultFuture() self._enqueue_task(future, method, args, kwargs) return future - async def execute_method_async(self, method: str, *args, **kwargs): + async def execute_method_async(self, method: Union[str, bytes], *args, + **kwargs): future = asyncio.get_running_loop().create_future() self._enqueue_task(future, method, args, kwargs) return await future @@ -230,8 +231,7 @@ def _run_worker_process( exception = None task_id, method, args, kwargs = items try: - executor = getattr(worker, method) - output = executor(*args, **kwargs) + output = run_method(worker, method, args, kwargs) except SystemExit: raise except KeyboardInterrupt: diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index edceece4b68dc..2afd99f99b353 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -2,8 +2,9 @@ import os from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +import cloudpickle import msgspec import vllm.envs as envs @@ -172,7 +173,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, - rank=rank) + rpc_rank=rank) else: worker = ray.remote( num_cpus=0, @@ -181,7 +182,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, - rank=rank) + rpc_rank=rank) worker_metadata.append( RayWorkerMetaData(worker=worker, created_rank=rank)) rank += 1 @@ -204,7 +205,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # as the resource holder for the driver process. self.driver_dummy_worker = worker self.driver_worker = RayWorkerWrapper( - vllm_config=self.vllm_config, rank=0) + vllm_config=self.vllm_config, rpc_rank=0) worker_metadata.pop(i) break @@ -410,7 +411,7 @@ def execute_model( def _run_workers( self, - method: str, + method: Union[str, Callable], *args, async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, @@ -426,6 +427,11 @@ def _run_workers( rather than blocking on the results. - args/kwargs: All workers share the same args/kwargs """ + if isinstance(method, str): + sent_method = method + else: + sent_method = cloudpickle.dumps(method) + del method if self.use_ray_spmd_worker: assert not async_run_tensor_parallel_workers_only, ( "async_run_tensor_parallel_workers_only is not supported for " @@ -440,7 +446,7 @@ def _run_workers( if async_run_tensor_parallel_workers_only: ray_workers = self.non_driver_workers ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) + worker.execute_method.remote(sent_method, *args, **kwargs) for worker in ray_workers ] @@ -455,7 +461,7 @@ def _run_workers( if not self.use_ray_spmd_worker: # Start the driver worker after all the ray workers. driver_worker_output = [ - self.driver_worker.execute_method(method, *args, **kwargs) + self.driver_worker.execute_method(sent_method, *args, **kwargs) ] # Get the results of the ray workers. diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index da1d77343cf3b..a5c4dcf0ec7f9 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,8 +1,14 @@ -from typing import Any, Dict, List, Optional, Tuple +import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import torch +import torch.distributed as dist + +import vllm.envs as envs from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + run_method) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -16,7 +22,7 @@ def _init_executor(self) -> None: """Initialize the worker and load the model. """ self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, - rank=0) + rpc_rank=0) distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) local_rank = 0 @@ -34,18 +40,13 @@ def _init_executor(self) -> None: self.collective_rpc("load_model") def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} - try: - func = getattr(self.driver_worker, method) - except AttributeError: - raise NotImplementedError(f"Method {method} is not implemented.") \ - from None - answer = func(*args, **kwargs) + answer = run_method(self.driver_worker, method, args, kwargs) return [answer] def check_health(self) -> None: @@ -55,3 +56,77 @@ def check_health(self) -> None: UniProcExecutorAsync = UniProcExecutor + + +class ExecutorWithExternalLauncher(UniProcExecutor): + """An executor that uses external launchers to launch engines, + specially designed for torchrun-compatible launchers, for + offline inference with tensor parallelism. + + see https://github.com/vllm-project/vllm/issues/11400 for + the motivation, and examples/offline_inference/torchrun_example.py + for the usage example. + + The key idea: although it is tensor-parallel inference, we only + create one worker per executor, users will launch multiple + engines with torchrun-compatible launchers, and all these engines + work together to process the same prompts. When scheduling is + deterministic, all the engines will generate the same outputs, + and they don't need to synchronize the states with each other. + """ + uses_ray: bool = False + + def _init_executor(self) -> None: + """Initialize the worker and load the model. + """ + assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \ + ("ExecutorWithExternalLauncher does not " + "support pipeline parallelism.") + assert self.vllm_config.scheduler_config.delay_factor == 0.0, \ + ("ExecutorWithExternalLauncher needs deterministic " + "execution, so it" + "does not support delay_factor in scheduling") + assert not envs.VLLM_USE_V1, \ + ("V1 architecture cannot guarantee deterministic execution, " + "so it is not supported in ExecutorWithExternalLauncher.") + self.driver_worker = WorkerWrapperBase(vllm_config=self.vllm_config, + rpc_rank=0) + # engines are launched in torchrun-compatible launchers + # so we can use the env:// method. + # required env vars: + # - RANK + # - MASTER_ADDR + # - MASTER_PORT + distributed_init_method = "env://" + rank = int(os.environ["RANK"]) + local_rank = rank + is_driver_worker = True + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker, + ) + self.collective_rpc("init_worker", args=([kwargs], )) + self.collective_rpc("init_device") + self.collective_rpc("load_model") + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """ + Determine the number of available KV blocks. + Add an additional all_reduce to get the min across all ranks. + Note that even if we have the same `gpu_memory_utilization` and + `swap_space`, the available memory in every rank might still + differ because NCCL can take different amounts of memory in + different ranks. Therefore, it is necessary to test if all ranks + agree on the same KV cache configuration. + """ + a, b = super().determine_num_available_blocks() + from vllm.distributed.parallel_state import get_world_group + cpu_group = get_world_group().cpu_group + a_tensor = torch.tensor([a], device="cpu", dtype=torch.int64) + b_tensor = torch.tensor([b], device="cpu", dtype=torch.int64) + dist.all_reduce(a_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + dist.all_reduce(b_tensor, group=cpu_group, op=dist.ReduceOp.MIN) + return a_tensor.item(), b_tensor.item() diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index dd981ffce8833..e6f26d2b74b2f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -940,8 +940,8 @@ def soft_cap(self): return self.base_layer.soft_cap @property - def use_gather(self): - return self.base_layer.use_gather + def use_all_gather(self): + return self.base_layer.use_all_gather @property def org_vocab_size(self): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5b7225bdc8f37..9809405ca9a61 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,5 +1,4 @@ import copy -import json import math import os import re @@ -180,8 +179,8 @@ def from_local_checkpoint( cls, lora_dir: str, expected_lora_modules: List[str], + peft_helper: PEFTHelper, *, - max_position_embeddings: Optional[int] = None, lora_model_id: Optional[int] = None, device: str = "cuda", dtype: Optional[torch.dtype] = None, @@ -196,9 +195,7 @@ def from_local_checkpoint( lora_dir: The local path that has lora data. expected_lora_modules: Name of modules that are expected to be replaced by lora. - max_position_embeddings: Max position embedding length. Used to - scaling the largest context length. If None, the lora model's - context length is not scaled. + peft_helper: Loaded lora configuration information. lora_model_id: Lora model id. If not given, automatically set by a global counter. device: Device where the lora model is loaded. @@ -207,18 +204,13 @@ def from_local_checkpoint( Returns: Loaded LoRA Model. """ - lora_config_path = os.path.join(lora_dir, "adapter_config.json") lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") new_embeddings_tensor_path = os.path.join( lora_dir, "new_embeddings.safetensors") new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") - with open(lora_config_path) as f: - config = json.load(f) - config["vllm_max_position_embeddings"] = max_position_embeddings - peft_helper = PEFTHelper.from_dict(config) unexpected_modules: List[Union[list[str], str]] if os.path.isfile(lora_tensor_path): tensors: Dict[str, torch.Tensor] = {} diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index dacfb9ebd1480..b9c506f6e0bfd 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -1,9 +1,12 @@ # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py +import json import math +import os from dataclasses import MISSING, dataclass, field, fields -from typing import Literal, Optional, Union +from typing import List, Literal, Optional, Union +from vllm.config import LoRAConfig from vllm.logger import init_logger logger = init_logger(__name__) @@ -11,6 +14,12 @@ @dataclass class PEFTHelper: + """ + A helper class for PEFT configurations, specifically designed for LoRA. + This class handles configuration validation, compatibility checks for + various LoRA implementations. + """ + # Required fields r: int lora_alpha: int @@ -29,20 +38,18 @@ class PEFTHelper: vllm_max_position_embeddings: Optional[int] = field(default=False) vllm_long_context_scaling_factor: Optional[float] = field(default=None) - def _validate_features(self): + def _validate_features(self) -> List[str]: + """ + Check if there are any unsupported Lora features. + """ error_msg = [] - if self.modules_to_save: error_msg.append("vLLM only supports modules_to_save being None.") - if self.use_dora: error_msg.append("vLLM does not yet support DoRA.") - - if error_msg: - raise ValueError(f"{', '.join(error_msg)}") + return error_msg def __post_init__(self): - self._validate_features() if self.use_rslora: logger.info_once("Loading LoRA weights trained with rsLoRA.") self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) @@ -78,3 +85,29 @@ def from_dict(cls, config_dict: dict) -> "PEFTHelper": for k, v in config_dict.items() if k in class_fields } return cls(**filtered_dict) + + @classmethod + def from_local_dir(cls, lora_path: str, + max_position_embeddings: Optional[int]) -> "PEFTHelper": + lora_config_path = os.path.join(lora_path, "adapter_config.json") + + with open(lora_config_path) as f: + config = json.load(f) + config["vllm_max_position_embeddings"] = max_position_embeddings + return cls.from_dict(config) + + def validate_legal(self, lora_config: LoRAConfig) -> None: + """ + Validates the LoRA configuration settings against application + constraints and requirements. + """ + error_msg = self._validate_features() + if self.r > lora_config.max_lora_rank: + error_msg.append( + f"LoRA rank {self.r} is greater than max_lora_rank" + f" {lora_config.max_lora_rank}.") + if self.bias != "none" and not lora_config.bias_enabled: + error_msg.append( + "Adapter bias cannot be used without bias_enabled.") + if error_msg: + raise ValueError(f"{' '.join(error_msg)}") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index eec462743fe9d..a64296f7fd902 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -12,6 +12,7 @@ from vllm.logger import init_logger from vllm.lora.models import (LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager, create_lora_manager) +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path @@ -95,6 +96,13 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: expected_lora_modules = list(set(expected_lora_modules)) lora_path = get_adapter_absolute_path(lora_request.lora_path) + peft_helper = PEFTHelper.from_local_dir( + lora_path, self.max_position_embeddings) + + # Validates the LoRA configuration against requirements before + # loading weights, throwing an exception if validation fails. + peft_helper.validate_legal(self.lora_config) + # For some models like Qwen2VL, we need to use hf_to_vllm_mapper # to ensure correct loading of lora weights. hf_to_vllm_mapper = None @@ -105,7 +113,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: lora = self._lora_model_cls.from_local_checkpoint( lora_path, expected_lora_modules, - max_position_embeddings=self.max_position_embeddings, + peft_helper=peft_helper, lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, @@ -120,15 +128,14 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel: # - No adapter found to download from huggingface (or in # offline mode) # - No local adapter files found at `lora_request.lora_path` + # For NotFoundError raise ValueError( f"Loading lora {lora_request.lora_name} failed: No adapter " f"found for {lora_path}") from e except Exception as e: - raise RuntimeError(f"Loading lora {lora_path} failed") from e - if lora.rank > self.lora_config.max_lora_rank: - raise ValueError( - f"LoRA rank {lora.rank} is greater than max_lora_rank " - f"{self.lora_config.max_lora_rank}.") + # For BadRequestError + raise e + if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " f"is greater than lora_extra_vocab_size " diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index f10a8fb8e03cf..2d8594cb8aafa 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -298,8 +298,11 @@ def __call__(self, input_ids: list[int], # token_bitmask is a CPU tensor for use with accept_token and # fill_next_token_bitmask so we move it to the device of scores device_type = scores.device.type + dtype = scores.dtype if device_type != "cuda": - scores = scores.to("cpu").unsqueeze(0) + # xgrammar on cpu only supports float32 scores + # see: https://github.com/mlc-ai/xgrammar/blob/c1b64920cad24f44f235778c1c00bb52d57da01a/python/xgrammar/kernels/apply_token_bitmask_inplace_cpu.py#L22 + scores = scores.to("cpu").float().unsqueeze(0) # Note: In this method, if the tensors have different dimensions # on CPU device fails, but on GPU it runs without error. Hence the @@ -307,7 +310,7 @@ def __call__(self, input_ids: list[int], xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device)) if device_type != "cuda": - scores = scores.to(device_type).squeeze() + scores = scores.to(dtype).to(device_type).squeeze() return scores diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index af7894b42c560..fb9684ac1c184 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -30,8 +30,10 @@ class FatreluAndMul(CustomOp): def __init__(self, threshold: float = 0.): super().__init__() self.threshold = threshold - if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if current_platform.is_cuda_alike(): self.op = torch.ops._C.fatrelu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native def forward_native(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 @@ -100,11 +102,13 @@ class MulAndSilu(CustomOp): def __init__(self): super().__init__() - if current_platform.is_cuda_alike() or current_platform.is_cpu(): + if current_platform.is_cuda_alike(): self.op = torch.ops._C.mul_and_silu elif current_platform.is_xpu(): from vllm._ipex_ops import ipex_ops self.op = ipex_ops.silu_and_mul + elif current_platform.is_cpu(): + self._forward_method = self.forward_native def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 00ae64bbe6388..52263e96fb9f9 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -344,11 +344,13 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if output_dim is not None and not use_bitsandbytes_4bit: + if output_dim is not None and not is_sharded_weight: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, @@ -546,6 +548,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: shard_size = loaded_weight.shape[output_dim] shard_offset = loaded_weight.shape[output_dim] * \ @@ -554,9 +561,7 @@ def weight_loader(self, param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for AQLM codebooks. @@ -941,6 +946,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: orig_qkv_offsets = { "q": (0, self.num_heads * self.head_size), @@ -964,9 +974,7 @@ def weight_loader(self, shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -1070,6 +1078,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit # Special case for GGUF is_gguf_weight = getattr(param, "is_gguf_weight", False) @@ -1085,9 +1097,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if input_dim is not None and not is_sharded_weight: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 2bc7e458494f7..42decde1d0f79 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -6,6 +6,7 @@ import torch.nn as nn import vllm.envs as envs +from vllm.config import get_current_vllm_config from vllm.distributed import (tensor_model_parallel_all_gather, tensor_model_parallel_gather) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -44,8 +45,10 @@ def __init__(self, self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. - self.use_gather = not current_platform.is_tpu( - ) and not envs.VLLM_USE_V1 + parallel_config = get_current_vllm_config().parallel_config + self.use_all_gather = current_platform.is_tpu() \ + or envs.VLLM_USE_V1 \ + or parallel_config.distributed_executor_backend == "external_launcher" # noqa def forward( self, @@ -88,16 +91,17 @@ def _get_logits( logits = lm_head.linear_method.apply(lm_head, hidden_states, bias=embedding_bias) - if self.use_gather: - # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) - else: + + if self.use_all_gather: # Gather is not supported for some devices such as TPUs. # Use all-gather instead. # NOTE(woosuk): Here, the outputs of every device should not be None # because XLA requires strict SPMD among all devices. Every device # should execute the same operations after gathering the logits. logits = tensor_model_parallel_all_gather(logits) + else: + # None may be returned for rank > 0 + logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[..., :self.org_vocab_size] diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 4969ee559522e..26dd5df4e55b2 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -247,6 +247,15 @@ def create_weights( def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + weight, weight_scale, _ = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=layer.weight, + weight_scale=layer.weight_scale_inv, + input_scale=layer.input_scale) + layer.weight = Parameter(weight, requires_grad=False) + layer.weight_scale_inv = Parameter(weight_scale, + requires_grad=False) return layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) @@ -495,6 +504,30 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + w13_weight, w13_weight_scale_inv, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale_inv, + layer.w13_input_scale) + w2_weight, w2_weight_scale_inv, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale_inv, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale_inv = torch.nn.Parameter( + w13_weight_scale_inv, requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale_inv = torch.nn.Parameter( + w2_weight_scale_inv, requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False) return # If checkpoint is fp16, quantize in place. if not self.quant_config.is_checkpoint_fp8_serialized: diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index f3c3e130e4161..b6882cc7c837c 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -5,6 +5,8 @@ import triton import triton.language as tl +from vllm.platforms import current_platform + def apply_w8a8_block_fp8_linear( input: torch.Tensor, @@ -33,11 +35,14 @@ def apply_w8a8_block_fp8_linear( def input_to_float8( - x: torch.Tensor, - dtype: torch.dtype = torch.float8_e4m3fn + x: torch.Tensor, + dtype: Optional[torch.dtype] = None ) -> Tuple[torch.Tensor, torch.Tensor]: """This function quantizes input values to float8 values " "with tensor-wise quantization.""" + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) finfo = torch.finfo(dtype) min_val, max_val = x.aminmax() amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) @@ -125,7 +130,7 @@ def per_token_group_quant_fp8( x: torch.Tensor, group_size: int, eps: float = 1e-10, - dtype: torch.dtype = torch.float8_e4m3fn, + dtype: Optional[torch.dtype] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the @@ -140,6 +145,9 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " f"by `group_size` {group_size}") diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 3fcd81a3c4213..d071cfe888f05 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -841,6 +841,37 @@ def get_input_positions( ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" + llm_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + input_tokens, + image_grid_thw, + video_grid_thw, + image_token_id, + video_token_id, + vision_start_token_id, + vision_end_token_id, + spatial_merge_size, + context_len, + seq_len, + ) + + return llm_positions.tolist(), mrope_position_delta + + @staticmethod + def get_input_positions_tensor( + input_tokens: List[int], + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + vision_end_token_id: int, + spatial_merge_size: int, + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> Tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -916,7 +947,7 @@ def get_input_positions( len(input_tokens)).item() llm_positions = llm_positions[:, context_len:seq_len] - return llm_positions.tolist(), mrope_position_delta + return llm_positions, mrope_position_delta @staticmethod def get_next_input_positions( @@ -930,6 +961,17 @@ def get_next_input_positions( seq_len + mrope_position_delta)) for _ in range(3) ] + @staticmethod + def get_next_input_positions_tensor( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> torch.Tensor: + return torch.arange( + mrope_position_delta + context_len, + mrope_position_delta + seq_len, + ).expand(3, -1) + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 9fe0db62435a0..f697c3245f098 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -182,6 +182,9 @@ class Source: fall_back_to_pt: bool = True """Whether .pt weights can be used.""" + allow_patterns_overrides: Optional[list[str]] = None + """If defined, weights will load exclusively using these patterns.""" + def __init__(self, load_config: LoadConfig): super().__init__(load_config) if load_config.model_loader_extra_config: @@ -218,6 +221,7 @@ def _prepare_weights( model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool, + allow_patterns_overrides: Optional[list[str]], ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. @@ -249,6 +253,9 @@ def _prepare_weights( if fall_back_to_pt: allow_patterns += ["*.pt"] + if allow_patterns_overrides is not None: + allow_patterns = allow_patterns_overrides + if not is_local: hf_folder = download_weights_from_hf( model_name_or_path, @@ -298,7 +305,8 @@ def _get_weights_iterator( ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( - source.model_or_path, source.revision, source.fall_back_to_pt) + source.model_or_path, source.revision, source.fall_back_to_pt, + source.allow_patterns_overrides) if self.load_config.load_format == LoadFormat.NPCACHE: # Currently np_cache only support *.bin checkpoints assert use_safetensors is False @@ -340,6 +348,8 @@ def _get_all_weights( prefix="", fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True), + allow_patterns_overrides=getattr(model, "allow_patterns_overrides", + None), ) yield from self._get_weights_iterator(primary_weights) @@ -353,7 +363,8 @@ def _get_all_weights( def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision, - fall_back_to_pt=True) + fall_back_to_pt=True, + allow_patterns_overrides=None) def load_model(self, vllm_config: VllmConfig) -> nn.Module: device_config = vllm_config.device_config @@ -1105,15 +1116,22 @@ def _load_weights(self, model_config: ModelConfig, weight_name, index, ) in self.modules_mapping.inverse_packed_mapping.items(): - shard_pos = quant_param_name.find(shard_name) # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight - if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": + shard_pos = quant_param_name.find(shard_name) + can_correct_rename = (shard_pos > 0) and ( + quant_param_name[shard_pos - 1] == ".") + # If the quant_param_name is packed, it won't occur in the + # param_dict before renaming. + new_quant_param_name = quant_param_name.replace( + shard_name, weight_name) + need_rename = (quant_param_name not in param_dict) \ + and (new_quant_param_name in param_dict) + if can_correct_rename and need_rename: shard_index = index - quant_param_name = quant_param_name.replace( - shard_name, weight_name) + quant_param_name = new_quant_param_name break # Models like Clip/Siglip may skip some layers in initialization, diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index fbd4937112e11..5b4757072353f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -459,16 +459,7 @@ def tensorize_vllm_model(engine_args: EngineArgs, stream.write(encryption_params.key) engine = LLMEngine.from_engine_args(engine_args) - if tensorizer_config._is_sharded: - # if the engine is a distributed engine (for tensor parallel) then each - # worker shard needs to serialize its part of the model. - engine.model_executor._run_workers( - "save_tensorized_model", - tensorizer_config=tensorizer_config, - ) - else: - # with a single worker, we can get to the underlying model directly - serialize_vllm_model( - engine.model_executor.driver_worker.model_runner.model, - tensorizer_config, - ) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 7e37ce3086e6b..d5f9b4d19e5ca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -41,7 +41,7 @@ from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -605,9 +605,50 @@ def forward( return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), + ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "rotary_pos_emb.inv_freq" in name: + continue + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={".word_embeddings": ""}, ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -660,52 +701,9 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - # Merge two ColumnParallelLinear into one MergedColumnParallelLinear - merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { - "transformer.vision.linear_proj.merged_proj.weight": { - "transformer.vision.linear_proj.gate_proj.weight": None, - "transformer.vision.linear_proj.dense_h_to_4h.weight": None, - } - } - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - is_weight_to_be_merge = False - for _, merged_weight_dict in merged_weights_dict.items(): - if name in merged_weight_dict: - assert merged_weight_dict[name] is None - merged_weight_dict[name] = loaded_weight - is_weight_to_be_merge = True - if is_weight_to_be_merge: - continue - if "rotary_pos_emb.inv_freq" in name: - continue - if "word_embeddings" in name: - name = name.replace(".word_embeddings", "") - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - for combined_name, merged_weight_dict in merged_weights_dict.items(): - if combined_name in params_dict: - param = params_dict[combined_name] - combined_weight = torch.cat(list(merged_weight_dict.values()), - dim=0) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, combined_weight) - loaded_params.add(combined_name) - return loaded_params + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class ChatGLM(ChatGLMBaseModel): @@ -726,6 +724,7 @@ class ChatGLM(ChatGLMBaseModel): class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): + packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], @@ -777,7 +776,7 @@ def __new__( ) -> None: config = vllm_config.model_config.hf_config # Initialize VL - if hasattr(config, "visual"): + if hasattr(config, "vision_config"): return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 99fa941c055d2..4d3d1c329a2c0 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,7 +1,7 @@ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math -from functools import cached_property, partial +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -9,7 +9,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoProcessor, BatchFeature, ProcessorMixin +from transformers import BatchFeature from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -31,6 +31,8 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, MlpProjectorConfig, VisionEncoderConfig) +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP @@ -129,25 +131,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(DeepseekVLV2Config) - def get_hf_processor(self) -> ProcessorMixin: - # TODO(Isotr0py): we should get rid of dependency on deepseek_vl2 - # in the future, because it's flasky and lack of maintenance. - try: - from deepseek_vl2.models.processing_deepseek_vl_v2 import ( - DeepseekVLV2Processor, select_best_resolution) - AutoProcessor.register("DeepseekVLV2Processor", - DeepseekVLV2Processor) - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "You need to `pip install " - "git+https://github.com/deepseek-ai/DeepSeek-VL2.git` " - "to use this model") from exc - - processor = self.ctx.get_hf_processor(DeepseekVLV2Processor) - processor.select_best_resolution = partial( - select_best_resolution, - candidate_resolutions=processor.candidate_resolutions) - return processor + def get_hf_processor(self) -> DeepseekVLV2Processor: + return self.ctx.get_hf_processor(DeepseekVLV2Processor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -224,31 +209,21 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - outputs = self.info.ctx.call_hf_processor( + processed_outputs = self.info.ctx.call_hf_processor( self.info.get_hf_processor(**mm_kwargs), dict(prompt=prompt, **mm_data), mm_kwargs, ) - - # Deepseek-vl2 processor don't return BatchFeature, - # we need to manually create it - processed_outputs = dict(input_ids=outputs["input_ids"]) - processed_outputs = BatchFeature(data=dict(processed_outputs), - tensor_type="pt") - - # Remove batch dimension from processor outputs, - # because we will try batch to create NestedTensors target_dtype = self.info.ctx.model_config.dtype - pixel_values = outputs["images"].to(target_dtype).squeeze(0) - images_spatial_crop = outputs["images_spatial_crop"].squeeze(0) + pixel_values = processed_outputs.pop("pixel_values").to( + target_dtype) + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] patches_per_image = [ x.prod().item() + 1 for x in images_spatial_crop ] - - # Rename `images` -> `pixel_values` to avoid confusion - processed_outputs["pixel_values"] = list( - pixel_values.split(patches_per_image)) - processed_outputs["images_spatial_crop"] = images_spatial_crop + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values else: tokenizer = self.info.get_tokenizer() processed_outputs = tokenizer(prompt, @@ -356,13 +331,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): f"Only 2D tile_tag is supported currently, got: {self.tile_tag}" ) + if self.text_config.topk_method == "noaux_tc": + architectures = ["DeepseekV3ForCausalLM"] + elif not self.text_config.use_mla: + architectures = ["DeepseekForCausalLM"] + else: + architectures = ["DeepseekV2ForCausalLM"] + self.language_model = init_vllm_registered_model( vllm_config=vllm_config, hf_config=self.text_config, prefix=maybe_prefix(prefix, "language"), - architectures=["DeepseekV3ForCausalLM"] - if self.text_config.topk_method == "noaux_tc" else - ["DeepseekV2ForCausalLM"], + architectures=architectures, ) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py new file mode 100644 index 0000000000000..b93a68680375d --- /dev/null +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -0,0 +1,151 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Llama model for fairseq2 weights.""" + +from typing import Iterable, Set, Tuple + +import torch +from torch.nn import Parameter + +from vllm.config import VllmConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import set_weight_attrs +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import AutoWeightsLoader, WeightsMapper + + +class Fairseq2LlamaForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + # For the model loader to read only the relevant checkpoint files + self.allow_patterns_overrides = [ + # either the full checkpoint + "model.pt", + # or the tp-sharded checkpoint of the current rank + f"model.{self.tp_rank}.pt", + ] + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + # fairseq2's serialization adds a wrapper to usual .pt state_dict's: + # { "model_key": my_model_name, "my_model_name": state_dict } + # which we first need to unpack + weights_wrapped = dict(weights) + weights = weights_wrapped[ + weights_wrapped["model_key"]].items() # type: ignore + + # remap keys + fs2_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder_frontend.embed.": "model.embed_tokens.", + "decoder.": "model.", + "final_proj.": "lm_head.", + }, + orig_to_new_substr={ + ".self_attn_layer_norm.": ".input_layernorm.", + ".ffn_layer_norm.": ".post_attention_layernorm.", + ".self_attn.output_proj.": ".self_attn.o_proj.", + ".ffn.gate_proj.": ".mlp.gate_proj.", + ".ffn.inner_proj.": ".mlp.up_proj.", + ".ffn.output_proj.": ".mlp.down_proj.", + ".layer_norm.": ".norm.", + }, + ) + weights = fs2_to_vllm_mapper.apply(weights) + + params = dict(self.named_parameters()) + + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights( + (self.reshape_fairseq2_weights(name, loaded_weight, params) + for name, loaded_weight in weights)) + + def flag_sharded_weights(self, params: dict[str, Parameter]): + """Sets the `is_sharded_weight` flag to True for all sharded weights""" + for name, param in params.items(): + modules = name.split(".") + if "norm" in name and len(param.size()) < 2: + # layer norms are not sharded + continue + elif any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # for now we repeat embedding layers for compatibility + continue + else: + # all other layers are sharded + set_weight_attrs(param, {"is_sharded_weight": True}) + + def reshape_fairseq2_weights( + self, + name: str, + loaded_weight: torch.Tensor, + params: dict[str, Parameter], + ) -> Tuple[str, torch.Tensor]: + """Reshape fairseq2's weights.""" + + def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: + attn_in = self.config.head_dim * n_heads + # check for a sharded weight on dim 0 + if attn_in // self.tp_size == w.size()[0]: + attn_in //= self.tp_size + n_heads //= self.tp_size + attn_out = self.config.hidden_size + return (w.view(n_heads, attn_in // n_heads // 2, 2, + attn_out).transpose(1, + 2).reshape(attn_in, attn_out)) + + modules = name.split(".") + + # rotary embeds should be sliced + if "k_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads) + + elif "q_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_attention_heads) + + # We make the loaded weights compatible with both + # full checkpoints and tp sharded checkpoints. + # Embeddings are repeated to fit the vocab size. + # Other weights are flagged for the weight_loader calls. + if any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # Embeddings are sharded on dim 0 + dim = 0 + # In fairseq2, vocab size has to be divisible by tp_size + # so we don't worry about padding + if self.tp_size > 1 and loaded_weight.shape[ + dim] < self.config.vocab_size: + assert loaded_weight.shape[ + dim] * self.tp_size == self.config.vocab_size, \ + "vocab_size should be divisible by tp_size." + repeats = [1] * len(loaded_weight.size()) + repeats[dim] = self.tp_size + # repeat to match vocab size and to be easily 'narrow'able + loaded_weight = loaded_weight.repeat(repeats) + set_weight_attrs(params[name], {"is_sharded_weight": False}) + # if embeddings are sharded, the rest is too + if "embed_tokens" in modules: + self.flag_sharded_weights(params) + + return name, loaded_weight diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 39a5736eb199b..51922e6f2d03d 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -42,7 +42,8 @@ def forward(self, images: torch.Tensor) -> torch.Tensor: torch.Tensor Transformed tensor with shape (B, L, D) """ - images = images.to(self.proj.weight.device) + images = images.to(device=self.proj.weight.device, + dtype=self.proj.weight.dtype) x = self.proj(images) x = x.flatten(2).transpose(1, 2) cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 722fff98d5c19..6cceded43a79d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -5,9 +5,11 @@ import torch import torch.nn as nn +from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -716,6 +718,27 @@ def load_weights(self, weights: Iterable[Tuple[str, return loader.load_weights(weights) +class MantisProcessingInfo(LlavaProcessingInfo): + + def get_hf_processor(self): + hf_config = self.get_hf_config() + vision_info = self.get_vision_encoder_info() + + if Version(TRANSFORMERS_VERSION) < Version("4.48"): + # BUG: num_additional_image_tokens = 0 but treated as 1, + # so we set vision_feature_select_strategy to None to offset this + vision_feature_select_strategy = None + else: + # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 + vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 + + return self.ctx.get_hf_processor( + LlavaProcessor, + patch_size=vision_info.get_patch_size(), + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + class MantisMultiModalProcessor(LlavaMultiModalProcessor): def apply( @@ -794,7 +817,7 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, - info=LlavaProcessingInfo, + info=MantisProcessingInfo, dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 78a47e64d9afc..6faa79f65d8de 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,8 +19,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, - VideoProcessorItems) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import PromptReplacement from vllm.multimodal.profiling import ProcessorInputs from vllm.sequence import IntermediateTensors @@ -145,6 +145,10 @@ def _get_num_unpadded_features( return (unpadded_features, newline_features) + def get_image_size_with_most_features(self) -> ImageSize: + # NOTE: This hardcoded value is found via processor tests + return ImageSize(width=1153, height=944) + def _get_num_frame_tokens( self, *, @@ -550,10 +554,12 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: # Preserve the order of modalities if there are multiple of them # from the order of kwargs. for input_key in kwargs: - if input_key == "pixel_values" and "images" not in modalities: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: modalities["images"] = self._parse_and_validate_image_input( **kwargs) - if input_key == "pixel_values_videos" and "videos" not in modalities: # noqa E501 + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: modalities["videos"] = self._parse_and_validate_video_input( **kwargs) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index d015f60c6d065..82de1c3574090 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -256,7 +256,15 @@ def forward( return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Qwen2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0dff9595c6c08..47d56175261e4 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -36,8 +36,9 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, @@ -153,29 +154,24 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, Any], ) -> BatchFeature: - mm_data = dict(mm_data) - audios = mm_data.pop("audios", []) - - if audios: - mm_data["audios"] = audios - - feature_extractor = self.info.get_feature_extractor(**mm_kwargs) - mm_kwargs = dict( - **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - else: - # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - pass + # Text-only input not supported in composite processor + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + mm_kwargs = dict( + **mm_kwargs, + sampling_rate=feature_extractor.sampling_rate, + ) - processed_outputs = super()._call_hf_processor( + return super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, ) - return processed_outputs - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -192,8 +188,14 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - placeholder = hf_config.audio_token_index + processor = self.info.get_hf_processor() + + # Use getattr with default to be compatible with transformers<4.48 + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + audio_bos_token = getattr(processor, "audio_bos_token", + "<|audio_bos|>") + audio_eos_token = getattr(processor, "audio_eos_token", + "<|audio_eos|>") feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: @@ -214,12 +216,16 @@ def get_replacement_qwen2_audio(item_idx: int): f"The audio {audio} (len={len(audio)}) is too short " "to be represented inside the model") - return [placeholder] * num_placeholders + return "".join([ + audio_bos_token, + audio_token * num_placeholders, + audio_eos_token, + ]) return [ PromptReplacement( modality="audio", - target=[placeholder], + target=audio_token, replacement=get_replacement_qwen2_audio, ) ] @@ -234,6 +240,26 @@ def _always_apply_prompt_replacements(self) -> bool: # tokens than the number of audio items) return not hasattr(self.info.get_hf_processor(), "audio_token") + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Only <|AUDIO|> tokens should be considered as placeholders, + # so we ignore the audio_bos_token and audio_eos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + @MULTIMODAL_REGISTRY.register_processor( Qwen2AudioMultiModalProcessor, diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 988d682d36be3..593ce4857af0f 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -12,7 +12,7 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -32,7 +32,7 @@ def forward(self, input): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): +class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -60,7 +60,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -74,14 +73,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, quant_config=quant_config), ReLU(), - RowParallelLinear(config.hidden_size, 1, + RowParallelLinear(config.hidden_size, + config.num_labels, quant_config=quant_config), ) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + self._pooler: SimplePooler self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -115,3 +111,31 @@ def load_weights(self, weights: Iterable[Tuple[str, loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) + + +class Qwen2ForRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 1 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False) + + +class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 2 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.STEP, + normalize=False, + softmax=True, + step_tag_id=151651, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d00e5d362c8bc..34d5c8ad089a3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -67,11 +67,15 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix) + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) from .vision import get_vit_attn_backend logger = init_logger(__name__) +# For profile run +_MAX_FRAMES_PER_VIDEO = 16 + # === Vision Inputs === # @@ -135,7 +139,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). + (concatenation of all videos' feature tensors). Tensor shape: `(num_image_features, hidden_size)` - `num_image_features` varies based on @@ -611,6 +615,7 @@ def forward( # adapter x = self.merger(x) + return x def load_weights(self, weights: Iterable[Tuple[str, @@ -874,8 +879,8 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int: max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - - num_frames = max(max_total_frames // max(max_videos, 1), 1) + num_frames = min(max(max_total_frames // max(max_videos, 1), 1), + _MAX_FRAMES_PER_VIDEO) # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 if num_frames > 1 and num_frames % 2 == 1: @@ -955,13 +960,14 @@ def _get_prompt_replacements( "image": hf_processor.image_token, "video": hf_processor.video_token, } + merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] assert isinstance(grid_thw, torch.Tensor) - num_tokens = grid_thw.prod() // merge_length + num_tokens = grid_thw.prod().item() // merge_length return placeholder[modality] * num_tokens return [ @@ -1047,11 +1053,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: Qwen2VLConfig = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - assert not cache_config.enable_prefix_caching, \ - "Qwen2-VL currently does not support prefix caching" self.config = config self.multimodal_config = multimodal_config @@ -1173,59 +1176,82 @@ def _parse_and_validate_video_input( video_embeds=video_embeds, video_grid_thw=video_grid_thw) - def _process_image_input(self, - image_input: Qwen2VLImageInputs) -> torch.Tensor: + def _process_image_input( + self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + if image_input["type"] == "image_embeds": - return image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, - grid_thw=image_input["image_grid_thw"]) - return image_embeds + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 - def _process_video_input(self, - video_input: Qwen2VLVideoInputs) -> torch.Tensor: if video_input["type"] == "video_embeds": - return video_input["video_embeds"].type(self.visual.dtype) + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, - grid_thw=video_input["video_grid_thw"]) - return video_embeds + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size - def _merge_multimodal_embeddings( - self, - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: torch.Tensor, - placeholder_token_id: int, - ) -> torch.Tensor: - mask = (input_ids == placeholder_token_id) - inputs_embeds[mask, :] = multimodal_embeddings - return inputs_embeds + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities def get_multimodal_embeddings( self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - if image_input is None and video_input is None: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: return None - # We make a tuple of each embedding with its modality string. This is a - # temporary workaround for models to handle mixed modalities when - # get_multimodal_embeddings and get_input_embeddings are called - # separately. - # TODO(ywang96): Add support for mixed-modality inference for v1. - multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] - - if image_input is not None: - image_embeds = self._process_image_input(image_input) - multimodal_embeddings.append((image_embeds, "image")) - if video_input is not None: - video_embeds = self._process_video_input(video_input) - multimodal_embeddings.append((video_embeds, "video")) + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings return multimodal_embeddings @@ -1237,21 +1263,9 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - for embeddings, modality in multimodal_embeddings: - if modality == "image": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.image_token_id, - ) - if modality == "video": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.video_token_id, - ) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a71f7f7029c7d..8d2719ca2d00d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -47,6 +47,7 @@ "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), @@ -126,6 +127,7 @@ "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 587f18ccaf98f..9301422383696 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -137,7 +137,7 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: # Text-only input not supported in composite processor - if not mm_data: + if not mm_data or not mm_data.get("audios", []): prompt_ids = self.info.get_tokenizer().encode(prompt) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") @@ -146,13 +146,6 @@ def _call_hf_processor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - if not audios: - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 44d16b40a5737..c3d8401da3c5b 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -94,7 +94,8 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: "so we use xformers backend instead. You can run " "`pip install flash-attn` to use flash-attention backend.") selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu() or current_platform.is_rocm() or current_platform.is_out_of_tree(): + elif current_platform.is_cpu() or current_platform.is_rocm( + ) or current_platform.is_out_of_tree(): # ROCM doesn't support xformers selected_backend = _Backend.TORCH_SDPA else: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2e2bd5c787558..55f2f08855d7e 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -139,28 +139,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: parallel_config.worker_cls = "vllm.worker.worker.Worker" - world_size = parallel_config.world_size - tensor_parallel_size = parallel_config.tensor_parallel_size - - from vllm.utils import (cuda_device_count_stateless, - update_environment_variables) - - # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers - if "CUDA_VISIBLE_DEVICES" not in os.environ: - update_environment_variables({ - "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) - }) - - cuda_device_count = cuda_device_count_stateless() - # Use confusing message for more common TP-only case. - assert tensor_parallel_size <= cuda_device_count, ( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - assert world_size <= cuda_device_count, ( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") - cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index e5fa4f0e4a2f6..ff54174f634af 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,9 +1,6 @@ import logging -import os from typing import Callable, Dict -import torch - import vllm.envs as envs logger = logging.getLogger(__name__) @@ -50,34 +47,6 @@ def load_general_plugins(): processes. They should be designed in a way that they can be loaded multiple times without causing issues. """ - - # all processes created by vllm will load plugins, - # and here we can inject some common environment variables - # for all processes. - - # see https://github.com/vllm-project/vllm/issues/10480 - os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' - # see https://github.com/vllm-project/vllm/issues/10619 - torch._inductor.config.compile_threads = 1 - - from vllm.platforms import current_platform - - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 - torch._dynamo.config.disable = True - if current_platform.is_hpu(): - # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) - # does not support torch.compile - # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for - # torch.compile support - is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' - if is_lazy: - torch._dynamo.config.disable = True - # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) - # requires enabling lazy collectives - # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 - os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' - global plugins_loaded if plugins_loaded: return diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 33babfebdca1e..29c0edd0ee535 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,7 +1,7 @@ import copy from collections import defaultdict from dataclasses import asdict, dataclass, field -from typing import Callable, Dict, List, Optional, Tuple, TypeAlias, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult @@ -128,7 +128,7 @@ def export_summary_stats_table_csv(self, filename: str): ]) df.to_csv(filename) - def convert_stats_to_dict(self) -> str: + def convert_stats_to_dict(self) -> dict[str, Any]: return { "metadata": { "num_running_seqs": self.num_running_seqs @@ -227,7 +227,7 @@ def _total_cuda_time(self): [self._cumulative_cuda_time(root) for root in self._module_tree]) def _build_stats_trees(self): - summary_dict: Dict[str, self.StatsTreeNode] = {} + summary_dict: Dict[str, _StatsTreeNode] = {} total_cuda_time = self._total_cuda_time() def pct_cuda_time(cuda_time_us): diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index bb6b99135580e..e906b1789cde8 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -2,6 +2,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest @@ -10,6 +11,10 @@ from vllm.spec_decode.top1_proposer import Top1Proposer +class _DummyModel(nn.Module): + pass + + class NGramWorker(NonLLMProposerWorkerBase): """NGramWorker provides a light drafter without need for model. @@ -36,7 +41,6 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int, def init_device(self): self.device = torch.device(f"{self.device_type}:{self.local_rank}") - self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( @@ -45,6 +49,12 @@ def init_device(self): vocab_size=self.vocab_size, ) + def load_model(self) -> None: + pass # Dummy + + def get_model(self) -> nn.Module: + return _DummyModel() + def sampler_output( self, execute_model_req: ExecuteModelRequest, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8896b7dbc6b8a..c6ff5e52f9388 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, @@ -15,6 +16,10 @@ logger = init_logger(__name__) +class _DummyModel(nn.Module): + pass + + class SmallerTpProposerWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with smaller tensor parallel degree than target model. @@ -139,6 +144,13 @@ def get_spec_proposals( return self._worker.get_spec_proposals( execute_model_req, seq_ids_with_bonus_token_in_last_step) + def get_model(self) -> nn.Module: + if self._is_dummy: + return _DummyModel() + + with self._patch_tensor_parallel_group(): + return self._worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 540d118d65ecb..0d66ede3d907a 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch +import torch.nn as nn from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict @@ -403,6 +404,9 @@ def initialize_cache(self, num_gpu_blocks: int, self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.scorer_worker.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index c97acffa1a719..f57dfded0a62f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -22,10 +22,10 @@ from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, - DbrxConfig, DeepseekVLV2Config, - EAGLEConfig, ExaoneConfig, - H2OVLChatConfig, +from vllm.transformers_utils.configs import (AriaConfig, ChatGLMConfig, + Cohere2Config, DbrxConfig, + DeepseekVLV2Config, EAGLEConfig, + ExaoneConfig, H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, @@ -52,6 +52,7 @@ } _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { + "aria": AriaConfig, "chatglm": ChatGLMConfig, "cohere2": Cohere2Config, "dbrx": DbrxConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f065c56124605..807ef4fbfd0c0 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,3 +1,4 @@ +from vllm.transformers_utils.configs.aria import AriaConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig @@ -23,6 +24,7 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ + "AriaConfig", "ChatGLMConfig", "Cohere2Config", "DbrxConfig", diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py index d253da0d96a34..f4b531225b5d0 100644 --- a/vllm/transformers_utils/configs/aria.py +++ b/vllm/transformers_utils/configs/aria.py @@ -1,7 +1,32 @@ +# Copyright 2024 Rhymes AI. All rights reserved. +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from typing import Mapping + +from transformers import PretrainedConfig from transformers.models.idefics2.configuration_idefics2 import ( Idefics2VisionConfig) from transformers.models.llama.configuration_llama import LlamaConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + class AriaVisionConfig(Idefics2VisionConfig): model_type = "aria_vision_model" @@ -45,3 +70,96 @@ def __init__( self.moe_num_experts = moe_num_experts self.moe_topk = moe_topk self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + """ + Configuration class for Aria model. + This class handles the configuration for both vision and text components of + the Aria model, + as well as additional parameters for image token handling and projector + mapping. + + Args: + vision_config (AriaVisionConfig or dict): Configuration for the vision + component. + text_config (AriaMoELMConfig or dict): Configuration for the text + component. + projector_patch_to_query_dict (dict): Mapping of patch sizes to query + dimensions. + ignore_index (int): Index to ignore in loss calculation. + image_token_index (int): Index used to represent image tokens. + **kwargs: Additional keyword arguments passed to the parent class. + Attributes: + model_type (str): Type of the model, set to "aria". + is_composition (bool): Whether the model is a composition of multiple + components. + ignore_index (int): Index to ignore in loss calculation. + image_token_index (int): Index used to represent image tokens. + projector_patch_to_query_dict (dict): Mapping of patch sizes to query + dimensions. + vision_config (AriaVisionConfig): Configuration for the vision + component. + text_config (AriaMoELMConfig): Configuration for the text component. + """ + + model_type = "aria" + is_composition = False + + def __init__( + self, + vision_config: AriaVisionConfig = AriaVisionConfig(), # noqa: B008 + text_config: AriaMoELMConfig = AriaMoELMConfig(), # noqa: B008 + projector_patch_to_query_dict: Mapping[int, int] = { + 1225: 128, + 4900: 256, + }, + ignore_index=-100, + image_token_index=32000, + tie_word_embeddings=False, + **kwargs, + ): + super().__init__(**kwargs) + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.tie_word_embeddings = tie_word_embeddings + attn_implementation = kwargs.pop("attn_implementation", None) + + # Set the default attention implementation to flash_attention_2 if not + # specified + self._attn_implementation = ("flash_attention_2" + if attn_implementation is None else + attn_implementation) + + # Convert the keys and values of projector_patch_to_query_dict to + # integers + # This ensures consistency even if they were provided as strings + self.projector_patch_to_query_dict = { + int(k): int(v) + for k, v in projector_patch_to_query_dict.items() + } + + if isinstance(vision_config, dict) and "model_type" in vision_config: + vision_config = AriaVisionConfig(**vision_config) + if attn_implementation is None: + vision_attn_implementation = "flash_attention_2" + elif attn_implementation == "sdpa": + logger.warning("SDPA is not supported for vit, using " + "flash_attention_2 instead") + vision_attn_implementation = "flash_attention_2" + else: + vision_attn_implementation = attn_implementation + vision_config._attn_implementation = vision_attn_implementation + + self.vision_config = vision_config + + if isinstance(text_config, dict) and "model_type" in text_config: + text_attn_implementation = ("sdpa" if attn_implementation is None + else attn_implementation) + text_config = AriaMoELMConfig(**text_config) + text_config._attn_implementation = text_attn_implementation + + self.text_config = text_config + + # This is needed for the static kv cache + self.num_hidden_layers = self.text_config.num_hidden_layers diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py new file mode 100644 index 0000000000000..9c71b8cada32e --- /dev/null +++ b/vllm/transformers_utils/processors/__init__.py @@ -0,0 +1,4 @@ +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) + +__all__ = ["DeepseekVLV2Processor"] diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py new file mode 100644 index 0000000000000..27cdf6bc22d0e --- /dev/null +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -0,0 +1,361 @@ +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +import math +from typing import List, Tuple + +import torch +import torchvision.transforms as T +from PIL import Image, ImageOps +from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast +from transformers.processing_utils import ProcessorMixin + + +class ImageTransform: + + def __init__(self, + mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + normalize: bool = True): + self.mean = mean + self.std = std + self.normalize = normalize + + transform_pipelines = [T.ToTensor()] + + if normalize: + transform_pipelines.append(T.Normalize(mean, std)) + + self.transform = T.Compose(transform_pipelines) + + def __call__(self, pil_img: Image.Image): + x = self.transform(pil_img) + return x + + +class DeepseekVLV2Processor(ProcessorMixin): + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + attributes = ["tokenizer"] + + def __init__( + self, + tokenizer: LlamaTokenizerFast, + candidate_resolutions: Tuple[Tuple[int, int]], + patch_size: int, + downsample_ratio: int, + image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + normalize: bool = True, + image_token: str = "", + pad_token: str = "<|▁pad▁|>", + add_special_token: bool = False, + sft_format: str = "deepseek", + mask_prompt: bool = True, + ignore_id: int = -100, + **kwargs, + ): + + self.candidate_resolutions = candidate_resolutions + self.image_size = candidate_resolutions[0][0] + self.patch_size = patch_size + self.image_mean = image_mean + self.image_std = image_std + self.normalize = normalize + self.downsample_ratio = downsample_ratio + + self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize) + self.tokenizer = tokenizer + self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference + + # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id' + if tokenizer.pad_token is None: + self.tokenizer.add_special_tokens({'pad_token': pad_token}) + + # add image token + image_token_id = self.tokenizer.vocab.get(image_token) + if image_token_id is None: + special_tokens = [image_token] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + self.image_token_id = self.tokenizer.vocab.get(image_token) + + # add five special tokens for grounding-related tasks + # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|> + special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + + # add special tokens for SFT data + special_tokens = ["<|User|>", "<|Assistant|>"] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + + self.image_token = image_token + self.pad_token = pad_token + self.add_special_token = add_special_token + self.sft_format = sft_format + self.mask_prompt = mask_prompt + self.ignore_id = ignore_id + + super().__init__( + tokenizer, + **kwargs, + ) + + def select_best_resolution(self, image_size): + # used for cropping + original_width, original_height = image_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float("inf") + + for width, height in self.candidate_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, downscaled_height = int( + original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + @property + def bos_id(self): + return self.tokenizer.bos_token_id + + @property + def eos_id(self): + return self.tokenizer.eos_token_id + + @property + def pad_id(self): + return self.tokenizer.pad_token_id + + def encode(self, text: str, bos: bool = True, eos: bool = False): + t = self.tokenizer.encode(text, add_special_tokens=False) + + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + + return t + + def decode(self, t: List[int], **kwargs) -> str: + return self.tokenizer.decode(t, **kwargs) + + def process_one( + self, + prompt: str, + images: List[Image.Image], + inference_mode: bool = True, + **kwargs, + ): + """ + + Args: + prompt (str): the formatted prompt; + conversations (List[Dict]): conversations with a list of messages; + images (List[ImageType]): the list of images; + inference_mode (bool): if True, then remove the last eos token; + system_prompt (str): the system prompt; + **kwargs: + + Returns: + outputs (BaseProcessorOutput): the output of the processor, + - input_ids (torch.LongTensor): [N + image tokens] + - target_ids (torch.LongTensor): [N + image tokens] + - pixel_values (torch.FloatTensor): [n_patches, 3, H, W] + - image_id (int): the id of the image token + - num_image_tokens (List[int]): the number of image tokens + """ + + assert (prompt is not None and images is not None + ), "prompt and images must be used at the same time." + + sft_format = prompt + tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images( + sft_format, images, bos=True, eos=True, cropping=len(images) <= 2) + masked_tokenized_str = [] + for token_index in tokenized_str: + if token_index != self.image_token_id: + masked_tokenized_str.append(token_index) + else: + masked_tokenized_str.append(self.ignore_id) + + assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \ + (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, " + f"imags_seq_mask's length {len(images_seq_mask)}, are not equal") + + input_ids = torch.LongTensor(tokenized_str) + target_ids = torch.LongTensor(masked_tokenized_str) + images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool) + + # set input_ids < 0 | input_ids == self.image_token_id as ignore_id + target_ids[(input_ids < 0) | + (input_ids == self.image_token_id)] = self.ignore_id + input_ids[input_ids < 0] = self.pad_id + + if inference_mode: + # 去掉结尾的eos token + assert input_ids[-1] == self.eos_id + input_ids = input_ids[:-1] + target_ids = target_ids[:-1] + images_seq_mask = images_seq_mask[:-1] + + if len(images_list) == 0: + pixel_values = torch.zeros((1, 3, self.image_size, self.image_size)) + images_spatial_crop = torch.zeros((1, 2), dtype=torch.long) + else: + pixel_values = torch.stack(images_list, dim=0) + images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long) + + input_ids = input_ids.unsqueeze(0) + + prepare = BatchFeature( + data=dict( + input_ids=input_ids, + pixel_values=pixel_values, + images_seq_mask=images_seq_mask, + images_spatial_crop=images_spatial_crop, + num_image_tokens=num_image_tokens, + ), + tensor_type="pt", + ) + return prepare + + def __call__( + self, + *, + prompt: str, + images: List[Image.Image], + inference_mode: bool = True, + **kwargs, + ): + """ + + Args: + prompt (str): the formatted prompt; + images (List[ImageType]): the list of images; + inference_mode (bool): if True, then remove the last eos token; + **kwargs: + + Returns: + outputs (BaseProcessorOutput): the output of the processor, + - input_ids (torch.LongTensor): [N + image tokens] + - images (torch.FloatTensor): [n_images, 3, H, W] + - image_id (int): the id of the image token + - num_image_tokens (List[int]): the number of image tokens + """ + + prepare = self.process_one( + prompt=prompt, + images=images, + inference_mode=inference_mode, + ) + + return prepare + + def tokenize_with_images( + self, + conversation: str, + images: List[Image.Image], + bos: bool = True, + eos: bool = True, + cropping: bool = True, + ): + """Tokenize text with tags.""" + assert conversation.count(self.image_token) == len(images) + text_splits = conversation.split(self.image_token) + images_list, images_seq_mask, images_spatial_crop = [], [], [] + num_image_tokens = [] + tokenized_str = [] + for text_sep, image in zip(text_splits, images): + """encode text_sep""" + tokenized_sep = self.encode(text_sep, bos=False, eos=False) + tokenized_str += tokenized_sep + images_seq_mask += [False] * len(tokenized_sep) + + """select best resolution for anyres""" + if cropping: + best_width, best_height = self.select_best_resolution(image.size) + else: + best_width, best_height = self.image_size, self.image_size + + """process the global view""" + global_view = ImageOps.pad(image, (self.image_size, self.image_size), + color=tuple(int(x * 255) for x in self.image_transform.mean)) + images_list.append(self.image_transform(global_view)) + + """process the local views""" + local_view = ImageOps.pad(image, (best_width, best_height), + color=tuple(int(x * 255) for x in self.image_transform.mean)) + for i in range(0, best_height, self.image_size): + for j in range(0, best_width, self.image_size): + images_list.append( + self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size)))) + + """record height / width crop num""" + num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size + images_spatial_crop.append([num_width_tiles, num_height_tiles]) + + """add image tokens""" + h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio) + # global views tokens h * (w + 1), 1 is for line separator + tokenized_image = [self.image_token_id] * h * (w + 1) + # add a separator between global and local views + tokenized_image += [self.image_token_id] + # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1) + tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1) + + tokenized_str += tokenized_image + images_seq_mask += [True] * len(tokenized_image) + num_image_tokens.append(len(tokenized_image)) + + """process the last text split""" + tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False) + tokenized_str += tokenized_sep + images_seq_mask += [False] * len(tokenized_sep) + + """add the bos and eos tokens""" + if bos: + tokenized_str = [self.bos_id] + tokenized_str + images_seq_mask = [False] + images_seq_mask + if eos: + tokenized_str = tokenized_str + [self.eos_id] + images_seq_mask = images_seq_mask + [False] + + assert len(tokenized_str) == len( + images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}" + + return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens + + +AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 17d722e3d88fe..d801cf4e4c7b1 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -18,6 +18,7 @@ Tekkenizer) from vllm.logger import init_logger +from vllm.utils import is_list_of if TYPE_CHECKING: from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -27,7 +28,7 @@ @dataclass class Encoding: - input_ids: List[int] + input_ids: Union[List[int], List[List[int]]] def maybe_serialize_tool_calls(request: ChatCompletionRequest): @@ -223,17 +224,25 @@ def __len__(self) -> int: def __call__( self, - prompt: str, + prompt: Union[str, List[str], List[int]], add_special_tokens: bool = False, truncation: bool = False, max_length: Optional[int] = None, ): - # Mistral Tokenizers should not add special tokens - input_ids = self.encode(prompt) - - if truncation: - input_ids = input_ids[:max_length] - + input_ids: Union[List[int], List[List[int]]] + # For List[str], original prompt text + if is_list_of(prompt, str): + input_ids_: List[List[int]] = [] + for p in prompt: + each_input_ids = self.encode_one(p, truncation, max_length) + input_ids_.append(each_input_ids) + input_ids = input_ids_ + # For List[int], apply chat template output, already tokens. + elif is_list_of(prompt, int): + input_ids = prompt + # For str, single prompt text + else: + input_ids = self.encode_one(prompt, truncation, max_length) return Encoding(input_ids=input_ids) def get_vocab(self) -> Dict[str, int]: @@ -245,6 +254,19 @@ def get_added_vocab(self) -> Dict[str, int]: # Mistral tokenizers have no added vocabulary return {} + def encode_one( + self, + prompt: str, + truncation: bool = False, + max_length: Optional[int] = None, + ) -> List[int]: + # Mistral Tokenizers should not add special tokens + input_ids = self.encode(prompt) + + if truncation: + input_ids = input_ids[:max_length] + return input_ids + def encode(self, prompt: str) -> List[int]: # `encode` should only be used for prompt completion # it should never be used for chat_completion. diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index a9deee881f41a..841df3994fba2 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -27,6 +27,17 @@ _GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {} +_USAGE_ENV_VARS_TO_COLLECT = [ + "VLLM_USE_MODELSCOPE", + "VLLM_USE_TRITON_FLASH_ATTN", + "VLLM_ATTENTION_BACKEND", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_PP_LAYER_PARTITION", + "VLLM_USE_TRITON_AWQ", + "VLLM_USE_V1", + "VLLM_ENABLE_V1_MULTIPROCESSING", +] + def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None: """Set global usage data that will be sent with every usage heartbeat.""" @@ -122,6 +133,7 @@ def __init__(self) -> None: self.gpu_count: Optional[int] = None self.gpu_type: Optional[str] = None self.gpu_memory_per_device: Optional[int] = None + self.env_var_json: Optional[str] = None # vLLM Information self.model_architecture: Optional[str] = None @@ -176,6 +188,12 @@ def _report_usage_once(self, model_architecture: str, self.vllm_version = VLLM_VERSION self.model_architecture = model_architecture + # Environment variables + self.env_var_json = json.dumps({ + env_var: getattr(envs, env_var) + for env_var in _USAGE_ENV_VARS_TO_COLLECT + }) + # Metadata self.log_time = _get_current_timestamp_ns() self.source = envs.VLLM_USAGE_SOURCE diff --git a/vllm/utils.py b/vllm/utils.py index 7477e7028f5ef..17bffd2846b46 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -36,6 +36,7 @@ overload) from uuid import uuid4 +import cloudpickle import numpy as np import numpy.typing as npt import psutil @@ -1922,36 +1923,57 @@ def kill_process_tree(pid: int): @dataclass class MemorySnapshot: """Memory snapshot.""" - torch_peak_in_bytes: int = 0 - torch_memory_in_bytes: int = 0 + torch_peak: int = 0 + cuda_memory: int = 0 + torch_memory: int = 0 + non_torch_memory: int = 0 timestamp: float = 0.0 + auto_measure: bool = True + + def __post_init__(self): + if self.auto_measure: + self.measure() def measure(self): - self.torch_peak_in_bytes = torch.cuda.max_memory_reserved() + # we measure the torch peak memory usage via allocated_bytes, + # rather than `torch.cuda.memory_reserved()` . + # After `torch.cuda.reset_peak_memory_stats()`, + # `torch.cuda.memory_reserved()` will keep growing, and only shrink + # when we call `torch.cuda.empty_cache()` or OOM happens. + self.torch_peak = torch.cuda.memory_stats().get( + "allocated_bytes.all.peak", 0) + + self.cuda_memory = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) - self.torch_memory_in_bytes = torch.cuda.memory_reserved() + # this is used to measure the non-torch memory usage + self.torch_memory = torch.cuda.memory_reserved() + + self.non_torch_memory = self.cuda_memory - self.torch_memory self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": - """support a - b""" return MemorySnapshot( - torch_peak_in_bytes=self.torch_peak_in_bytes - - other.torch_peak_in_bytes, - torch_memory_in_bytes=self.torch_memory_in_bytes - - other.torch_memory_in_bytes, - timestamp=self.timestamp - other.timestamp) + torch_peak=self.torch_peak - other.torch_peak, + cuda_memory=self.cuda_memory - other.cuda_memory, + torch_memory=self.torch_memory - other.torch_memory, + non_torch_memory=self.non_torch_memory - other.non_torch_memory, + timestamp=self.timestamp - other.timestamp, + auto_measure=False, + ) @dataclass class MemoryProfilingResult: - """Memory profiling result. - """ # noqa - baseline_memory_in_bytes: int = 0 - non_kv_cache_memory_in_bytes: int = 0 - torch_peak_increase_in_bytes: int = 0 - non_torch_increase_in_bytes: int = 0 - weights_memory_in_bytes: float = 0 + """Memory profiling result. All numbers are in bytes. + """ + non_kv_cache_memory: int = 0 + torch_peak_increase: int = 0 + non_torch_increase: int = 0 + weights_memory: float = 0 + before_create: MemorySnapshot = field(default_factory=MemorySnapshot) before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 @@ -1959,18 +1981,14 @@ class MemoryProfilingResult: @contextlib.contextmanager def memory_profiling( - baseline_memory_in_bytes: int, weights_memory_in_bytes: int -) -> Generator[MemoryProfilingResult, None, None]: + baseline_snapshot: MemorySnapshot, + weights_memory: int) -> Generator[MemoryProfilingResult, None, None]: """Memory profiling context manager. - baseline_memory_in_bytes: memory used by all the components other than - the current vLLM instance. It contains: memory used by other processes, memory - used by another vLLM instance in the same process, etc. It is usually measured - before the current vLLM instance initialize the device. And we assume it is - constant during the profiling of the current vLLM instance. - weights_memory_in_bytes: memory used by PyTorch when loading the model weights. + baseline_snapshot: the memory snapshot before the current vLLM instance. + weights_memory: memory used by PyTorch when loading the model weights. Note that, before loading the model weights, we also initialize the device and distributed environment, which may consume some memory. This part is not - included in the weights_memory_in_bytes because PyTorch does not control it. + included in the weights_memory because PyTorch does not control it. The memory in one GPU can be classified into 3 categories: 1. memory used by anything other than the current vLLM instance. @@ -2005,20 +2023,21 @@ def memory_profiling( b. 2 GiB reserved for the peak activation tensors (category 2) c. 1 GiB used by non-torch components (category 3) - The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. + The memory used for loading weights (a.) is directly given from the argument `weights_memory`. - The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). - (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), - subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`. + The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). """ # noqa + gc.collect() + torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() result = MemoryProfilingResult() - result.baseline_memory_in_bytes = baseline_memory_in_bytes + result.before_create = baseline_snapshot # the part of memory used for holding the model weights - result.weights_memory_in_bytes = weights_memory_in_bytes + result.weights_memory = weights_memory result.before_profile.measure() @@ -2029,13 +2048,12 @@ def memory_profiling( result.after_profile.measure() - diff = result.after_profile - result.before_profile - result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes - current_cuda_memory_bytes = torch.cuda.mem_get_info( - )[1] - torch.cuda.mem_get_info()[0] - result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes # noqa - result.profile_time = diff.timestamp - result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa + diff_profile = result.after_profile - result.before_profile + diff_from_create = result.after_profile - result.before_create + result.torch_peak_increase = diff_profile.torch_peak + result.non_torch_increase = diff_from_create.non_torch_memory + result.profile_time = diff_profile.timestamp + result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 @@ -2166,3 +2184,25 @@ def bind_kv_cache( assert len(forward_ctx.kv_cache) == len(kv_cache) for ve, ve_kv_cache in enumerate(kv_cache): forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx] + + +def run_method(obj: Any, method: Union[str, bytes, Callable], args: Tuple[Any], + kwargs: Dict[str, Any]) -> Any: + """ + Run a method of an object with the given arguments and keyword arguments. + If the method is string, it will be converted to a method using getattr. + If the method is serialized bytes and will be deserialized using + cloudpickle. + If the method is a callable, it will be called directly. + """ + if isinstance(method, bytes): + func = partial(cloudpickle.loads(method), obj) + elif isinstance(method, str): + try: + func = getattr(obj, method) + except AttributeError: + raise NotImplementedError(f"Method {method!r} is not" + " implemented.") from None + else: + func = partial(method, obj) # type: ignore + return func(*args, **kwargs) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 22a5d2fb08a48..bab99fe37caee 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -3,7 +3,10 @@ from dataclasses import dataclass from typing import Any, List, NamedTuple, Optional, Tuple +from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec, + KVCacheTensor) from vllm.v1.request import Request logger = init_logger(__name__) @@ -305,3 +308,124 @@ def hash_request_tokens(block_size: int, ret.append(block_hash) parent_block_hash_value = block_hash.hash_value return ret + + +def check_enough_kv_cache_memory(vllm_config: VllmConfig, + kv_cache_spec: KVCacheSpec, + available_memory: int): + """ + Checks whether `available_memory` is enough for the KV cache to hold at + least one request with the model's max_model_len. + + Args: + vllm_config: The global VllmConfig + kv_cache_spec: The kv cache spec of the model + available_memory: Memory available for KV cache in bytes. + + Raises: + ValueError: If there is not enough memory available for the KV cache. + """ + + if available_memory <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + + max_model_len = vllm_config.model_config.max_model_len + needed_memory = 0 + for layer_spec in kv_cache_spec.values(): + needed_memory += layer_spec.bytes_for_tokens(max_model_len) + + if needed_memory > available_memory: + raise ValueError( + f"To serve at least one request with the models's max seq len " + f"({max_model_len}), ({needed_memory/1024/1024/1024:.2f} GB KV " + f"cache is needed, which is larger than the available KV cache " + f"memory ({available_memory/1024/1024/1024:.2f} GB). Try " + f"increasing `gpu_memory_utilization` or decreasing " + f"`max_model_len` when initializing the engine.") + + +def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool: + """ + Whether all layers in the given KVCacheSpec have the same type of KV cache. + + Args: + kv_cache_spec: The KVCacheSpec of the model + + Returns: + True if all layers have the same type, False otherwise. + """ + + layer_keys = set(layer.type_id for layer in kv_cache_spec.values()) + return len(layer_keys) == 1 + + +def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, + kv_cache_spec: KVCacheSpec, + available_memory: int) -> KVCacheConfig: + """ + Generates the KV cache configuration for a model with one type of KV cache. + Divide the available memory equally among all layers. + + Args: + vllm_config: The global VllmConfig + kv_cache_spec: The kv cache spec of the model + available_memory: Memory available for KV cache in bytes. + + Returns: + The generated KVCacheConfig + """ + + page_sizes = {layer.page_size_bytes for layer in kv_cache_spec.values()} + assert len(page_sizes) == 1 + page_size = page_sizes.pop() + + num_blocks = int(available_memory // page_size // len(kv_cache_spec)) + num_blocks = max(num_blocks, 0) + + if vllm_config.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = \ + vllm_config.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override + + logger.info("# GPU blocks: %d", num_blocks) + + per_layer_size = page_size * num_blocks + + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, + tensors={ + layer_name: KVCacheTensor(size=per_layer_size) + for layer_name in kv_cache_spec + }, + groups=[[layer_name for layer_name in kv_cache_spec]], + kv_cache_spec=kv_cache_spec) + return kv_cache_config + + +def get_kv_cache_config(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec, + available_memory: int) -> KVCacheConfig: + """ + Generates the KV cache configuration for a model + TODO: support hybrid models with more than one type of KV cache. + + Args: + vllm_config: The global VllmConfig + kv_cache_spec: The kv cache spec of the model + available_memory: Memory available for KV cache in bytes. + + Returns: + The generated KVCacheConfig + """ + check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) + if is_kv_cache_type_uniform(kv_cache_spec): + # KV cache of all layers are the same, which is true for most models. + # Allocate the same amount of memory for each layer. + return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, + available_memory) + else: + raise NotImplementedError diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ef616229aa57b..26ebc7edcf03e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -11,11 +11,12 @@ import zmq.asyncio from msgspec import msgpack -from vllm.config import CacheConfig, VllmConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import get_exception_traceback, zmq_socket_ctx +from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, @@ -49,7 +50,7 @@ def __init__( # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( - vllm_config.cache_config) + vllm_config) vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks @@ -65,21 +66,25 @@ def __init__( vllm_config.model_config) def _initialize_kv_caches(self, - cache_config: CacheConfig) -> Tuple[int, int]: + vllm_config: VllmConfig) -> Tuple[int, int]: start = time.time() - num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( - ) - if cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_gpu_blocks, - num_gpu_blocks_override) - num_gpu_blocks = num_gpu_blocks_override + # Get all kv cache needed by the model + kv_cache_spec = self.model_executor.get_kv_cache_spec() + + # Profiles the peak memory usage of the model to determine how much + # memory can be allocated for kv cache. + availble_gpu_memory = self.model_executor.determine_available_memory() + # Get the kv cache tensor size + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + availble_gpu_memory) + num_gpu_blocks = kv_cache_config.num_blocks num_cpu_blocks = 0 - self.model_executor.initialize(num_gpu_blocks) + + # Initialize kv cache and warmup the execution + self.model_executor.initialize(kv_cache_config) + elapsed = time.time() - start logger.info(("init engine (profile, create kv cache, " "warmup model) took %.2f seconds"), elapsed) diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 7c17f60510ae1..131be759842c7 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,58 +1,92 @@ -from abc import ABC, abstractmethod -from typing import Tuple, Type +from typing import Type from vllm.config import VllmConfig +from vllm.executor.executor_base import ExecutorBase +from vllm.executor.ray_distributed_executor import ( # noqa + RayDistributedExecutor as RayDistributedExecutorV0) +from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0) +from vllm.executor.uniproc_executor import ( # noqa + UniProcExecutor as UniProcExecutorV0) +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput -class Executor(ABC): - """Abstract class for executors.""" +class Executor(ExecutorBase): + """ + Abstract class for v1 executors, mainly define some methods for v1. + For methods shared by v0 and v1, define them in ExecutorBase""" @staticmethod def get_class(vllm_config: VllmConfig) -> Type["Executor"]: executor_class: Type[Executor] + parallel_config = vllm_config.parallel_config distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend) + if distributed_executor_backend is None: + # If the user does not specify the distributed executor backend, + # we will choose the backend based on the world size. + if parallel_config.world_size > 1: + distributed_executor_backend = "mp" + else: + distributed_executor_backend = "uni" + if distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( # noqa - RayDistributedExecutor) executor_class = RayDistributedExecutor elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor + elif distributed_executor_backend == "uni": + executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # TODO: make v1 scheduling deterministic + # to support external launcher + executor_class = ExecutorWithExternalLauncher else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor + raise ValueError("Unknown distributed executor backend: " + f"{distributed_executor_backend}") return executor_class - @abstractmethod - def __init__(self, vllm_config: VllmConfig) -> None: - raise NotImplementedError + def initialize(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize the KV caches and begin the model execution loop of the + underlying workers. + """ + self.collective_rpc("initialize_cache", args=(kv_cache_config, )) + self.collective_rpc("compile_or_warm_up_model") - @abstractmethod - def initialize(self, num_gpu_blocks: int) -> None: - raise NotImplementedError + def determine_available_memory(self) -> int: # in bytes + output = self.collective_rpc("determine_available_memory") + # Since we use a shared centralized controller, we take the minimum + # memory size across all workers to make sure all the memory + # operators can be applied to all workers. + return min(output) - @abstractmethod - def determine_num_available_blocks(self) -> Tuple[int, int]: - raise NotImplementedError + def get_kv_cache_spec(self) -> KVCacheSpec: + output = self.collective_rpc("get_kv_cache_spec") + for x in output: + assert x == output[0] + return output[0] - @abstractmethod def execute_model( self, scheduler_output, ) -> ModelRunnerOutput: - raise NotImplementedError + output = self.collective_rpc("execute_model", + args=(scheduler_output, )) + return output[0] - @abstractmethod def profile(self, is_start: bool = True): - raise NotImplementedError + self.collective_rpc("profile", args=(is_start, )) + + +class UniProcExecutor(UniProcExecutorV0, Executor): + pass + + +class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor): + pass - @abstractmethod - def shutdown(self): - pass - @abstractmethod - def check_health(self) -> None: - raise NotImplementedError +class RayDistributedExecutor(RayDistributedExecutorV0, Executor): + pass diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index cee0fcc0bad68..f6cf35da0106b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -6,9 +6,11 @@ import weakref from dataclasses import dataclass from enum import Enum, auto +from functools import partial from multiprocessing.process import BaseProcess -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import cloudpickle import psutil import zmq @@ -23,7 +25,6 @@ from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) from vllm.v1.executor.abstract import Executor -from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -34,7 +35,7 @@ class MultiprocExecutor(Executor): - def __init__(self, vllm_config: VllmConfig) -> None: + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) @@ -52,9 +53,6 @@ def sigusr1_handler(signum, frame): signal.signal(signal.SIGUSR1, sigusr1_handler) - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size assert self.world_size == tensor_parallel_size, ( @@ -79,7 +77,8 @@ def sigusr1_handler(signum, frame): # Create workers self.workers: List[WorkerProcHandle] = [] for rank in range(self.world_size): - worker = WorkerProc.make_worker_process(vllm_config, rank, rank, + worker = WorkerProc.make_worker_process(self.vllm_config, rank, + rank, distributed_init_method, scheduler_output_handle) self.workers.append(worker) @@ -90,53 +89,24 @@ def sigusr1_handler(signum, frame): for w in self.workers: w.worker_response_mq.wait_until_ready() - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - """ - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """ - Determine the number of available KV blocks by invoking the - underlying worker. - """ - num_blocks = self.collective_rpc("determine_num_available_blocks") - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - def collective_rpc(self, - method: str, + method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: - """ - Execute an RPC call on workers. - - Args: - method: Name of the worker method to execute - timeout: Maximum time in seconds to wait for execution. Rases a - TimeoutError on timeout. None means wait indefinitely. - args: Positional arguments to pass to the worker method - kwargs: Keyword arguments to pass to the worker method - - Returns: - List of results from each worker - """ start_time = time.monotonic() kwargs = kwargs or {} + # NOTE: If the args are heterogeneous, then we pack them into a list, + # and unpack them in the method of every worker, because every worker + # knows their own rank. try: - self.rpc_broadcast_mq.enqueue((method, args, kwargs)) + if isinstance(method, str): + send_method = method + else: + send_method = cloudpickle.dumps( + method, protocol=pickle.HIGHEST_PROTOCOL) + self.rpc_broadcast_mq.enqueue((send_method, args, kwargs)) responses = [None] * self.world_size for w in self.workers: @@ -160,18 +130,6 @@ def collective_rpc(self, # Re-raise any other exceptions raise e - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - model_output = self.collective_rpc("execute_model", - args=(scheduler_output, ))[0] - return model_output - - def profile(self, is_start: bool = True): - self.collective_rpc("profile", args=(is_start, )) - return - def _ensure_worker_termination(self): """Ensure that all worker processes are terminated. Assumes workers have received termination requests. Waits for processing, then sends @@ -246,7 +204,7 @@ def __init__( ready_path: str, ): self.rank = rank - wrapper = WorkerWrapperBase(vllm_config=vllm_config, rank=rank) + wrapper = WorkerWrapperBase(vllm_config=vllm_config, rpc_rank=rank) # TODO: move `init_worker` to executor level as a collective rpc call all_kwargs: List[Dict] = [ {} for _ in range(vllm_config.parallel_config.world_size) @@ -403,7 +361,11 @@ def worker_busy_loop(self): method, args, kwargs = self.rpc_broadcast_mq.dequeue() try: - output = getattr(self.worker, method)(*args, **kwargs) + if isinstance(method, str): + func = getattr(self.worker, method) + elif isinstance(method, bytes): + func = partial(cloudpickle.loads(method), self.worker) + output = func(*args, **kwargs) except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py deleted file mode 100644 index 79acc60001c99..0000000000000 --- a/vllm/v1/executor/ray_executor.py +++ /dev/null @@ -1,342 +0,0 @@ -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple - -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.executor.ray_utils import (RayWorkerWrapper, - initialize_ray_cluster, ray) -from vllm.v1.outputs import ModelRunnerOutput - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - - -class RayExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.model_config = vllm_config.model_config - self.forward_dag: Optional[ray.dag.CompiledDAG] = None - - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - initialize_ray_cluster(self.parallel_config) - placement_group = self.parallel_config.placement_group - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - # A list of workers to run a model. - self.workers: List[RayWorkerWrapper] = [] - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - # Create the workers. - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - # Skip bundles that don't have GPUs, - # as each worker needs one GPU. - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=1, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - self.workers.append(worker) - - logger.debug("workers: %s", self.workers) - worker_ips = [ - ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] - for worker in self.workers - ] - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. This is simply a tiebreaker to make - sure the workers are sorted in a deterministic way. - """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids") - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. - # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - "VLLM_USE_V1": - str(int(envs.VLLM_USE_V1)), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. `get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - self._run_workers("initialize") - self._run_workers("load_model") - - def _configure_ray_workers_use_nsight(self, - ray_remote_kwargs) -> Dict[str, Any]: - # If nsight profiling is enabled, we need to set the profiling - # configuration for the ray workers as runtime env. - runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) - runtime_env.update({ - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - }) - - return ray_remote_kwargs - - def _get_env_vars_to_be_updated(self): - return self._env_vars_for_all_workers - - def _get_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Dict[str, Any]: - """ - Return worker init args for a given rank. - """ - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """ - Determine the number of available KV blocks. - - This invokes `determine_num_available_blocks` on each worker and takes - the min of the results, guaranteeing that the selected cache sizes are - compatible with all workers. - - Returns: - - tuple[num_gpu_blocks, num_cpu_blocks] - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers("determine_num_available_blocks") - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - - return num_gpu_blocks, num_cpu_blocks - - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV cache in all workers. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# GPU blocks: %d", num_gpu_blocks) - self._run_workers("initialize_cache", num_gpu_blocks) - self._run_workers("compile_or_warm_up_model") - - def _run_workers( - self, - method: str, - *args, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - **kwargs, - ) -> Any: - """ - Runs the given method on all workers. Can be used in the following - ways: - - Args: - - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - count = len(self.workers) - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 0, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 0, None) - - ray_worker_refs = [ - worker.execute_method.remote( # type: ignore[attr-defined] - method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) - ] - return ray.get(ray_worker_refs) - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - if self.forward_dag is None: - self.forward_dag = self._compiled_ray_dag() - # Only the first worker (with rank 0) returns the execution result. - # Others return None. - output = ray.get(self.forward_dag.execute(scheduler_output))[0] - return output - - def profile(self, is_start=True): - raise NotImplementedError - - def shutdown(self): - if hasattr(self, "forward_dag") and self.forward_dag is not None: - self.forward_dag.teardown() - import ray - for worker in self.workers: - ray.kill(worker) - self.forward_dag = None - - def check_health(self) -> None: - logger.debug("Called check_health.") - - def _check_ray_compiled_graph_installation(self): - import pkg_resources - from packaging import version - - required_version = version.parse("2.39") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) - if current_version < required_version: - raise ValueError(f"Ray version {required_version} is " - f"required, but found {current_version}") - - import importlib.util - raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref") - if raycg is None: - raise ValueError("Ray Compiled Graph is not installed. " - "Run `pip install ray[adag]` to install it.") - - cupy_spec = importlib.util.find_spec("cupy") - if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: - raise ValueError( - "cupy is not installed but required since " - "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." - "Run `pip install ray[adag]` and check cupy installation.") - - def _compiled_ray_dag(self): - assert self.parallel_config.use_ray - self._check_ray_compiled_graph_installation() - from ray.dag import InputNode, MultiOutputNode - - with InputNode() as input_batches: - outputs = [ - worker.execute_model.bind( # type: ignore[attr-defined] - input_batches) for worker in self.workers - ] - forward_dag = MultiOutputNode(outputs) - - return forward_dag.experimental_compile() - - def __del__(self): - self.shutdown() diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py deleted file mode 100644 index fc9715b7a5909..0000000000000 --- a/vllm/v1/executor/ray_utils.py +++ /dev/null @@ -1,280 +0,0 @@ -import time -from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple - -from vllm.config import ParallelConfig -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import get_ip -from vllm.v1.outputs import ModelRunnerOutput -from vllm.worker.worker_base import WorkerWrapperBase - -if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput - -logger = init_logger(__name__) -PG_WAIT_TIMEOUT = 60 - -try: - import ray - from ray.util import placement_group_table - from ray.util.placement_group import PlacementGroup - try: - from ray._private.state import available_resources_per_node - except ImportError: - # Ray 2.9.x doesn't expose `available_resources_per_node` - from ray._private.state import state as _state - available_resources_per_node = _state._available_resources_per_node - - class RayWorkerWrapper(WorkerWrapperBase): - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - # Since the compiled DAG runs a main execution - # in a different thread that calls cuda.set_device. - # The flag indicates is set_device is called on - # that thread. It will be removed soon. - self.compiled_dag_cuda_device_set = False - - def get_node_ip(self) -> str: - return get_ip() - - def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: - node_id = ray.get_runtime_context().get_node_id() - device_key = current_platform.ray_device_key - if not device_key: - raise RuntimeError("current platform %s does not support ray.", - current_platform.device_name) - gpu_ids = ray.get_runtime_context().get_accelerator_ids( - )[device_key] - return node_id, gpu_ids - - def setup_device_if_necessary(self): - # TODO(swang): This is needed right now because Ray CG executes - # on a background thread, so we need to reset torch's current - # device. - # We can remove this API after it is fixed in compiled graph. - import torch - assert self.worker is not None, "Worker is not initialized" - if not self.compiled_dag_cuda_device_set: - torch.cuda.set_device(self.worker.device) - self.compiled_dag_cuda_device_set = True - - def execute_model( - self, - scheduler_output: "SchedulerOutput", - ) -> ModelRunnerOutput: - self.setup_device_if_necessary() - assert self.worker is not None, "Worker is not initialized" - output = self.worker.model_runner.execute_model(scheduler_output) - return output - - ray_import_err = None - -except ImportError as e: - ray = None # type: ignore - ray_import_err = e - RayWorkerWrapper = None # type: ignore - - -def ray_is_available() -> bool: - """Returns True if Ray is available.""" - return ray is not None - - -def assert_ray_available(): - """ - Raise an exception if Ray is not available. - """ - if ray is None: - raise ValueError("Failed to import Ray, please install Ray with " - "`pip install ray`.") from ray_import_err - - -def _verify_bundles(placement_group: "PlacementGroup", - parallel_config: ParallelConfig, device_str: str): - """ - Verify a given placement group has bundles located in the right place. - - There are 2 rules. - - Warn if all tensor parallel workers cannot fit in a single node. - - Fail if driver node is not included in a placement group. - - Args: - placement_group: The placement group to verify. - parallel_config: The parallel configuration. - device_str: The required device. - """ - assert ray.is_initialized(), ( - "Ray is not initialized although distributed-executor-backend is ray.") - pg_data = placement_group_table(placement_group) - # bundle_idx -> node_id - bundle_to_node_ids = pg_data["bundles_to_node_id"] - # bundle_idx -> bundle (e.g., {"GPU": 1}) - bundles = pg_data["bundles"] - # node_id -> List of bundle (e.g., {"GPU": 1}) - node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) - - for bundle_idx, node_id in bundle_to_node_ids.items(): - node_id_to_bundle[node_id].append(bundles[bundle_idx]) - driver_node_id = ray.get_runtime_context().get_node_id() - - if driver_node_id not in node_id_to_bundle: - raise RuntimeError( - f"driver node id {driver_node_id} is not included in a placement " - f"group {placement_group.id}. Node id -> bundles " - f"{node_id_to_bundle}. " - "You don't have enough GPUs available in a current node. Check " - "`ray status` to see if you have available GPUs in a node " - f"{driver_node_id} before starting an vLLM engine.") - - for node_id, bundles in node_id_to_bundle.items(): - if len(bundles) < parallel_config.tensor_parallel_size: - logger.warning( - "tensor_parallel_size=%d " - "is bigger than a reserved number of %ss (%d " - "%ss) in a node %s. Tensor parallel workers can be " - "spread out to 2+ nodes which can degrade the performance " - "unless you have fast interconnect across nodes, like " - "Infiniband. To resolve this issue, make sure you have more " - "than %d GPUs available at each node.", - parallel_config.tensor_parallel_size, device_str, len(bundles), - device_str, node_id, parallel_config.tensor_parallel_size) - - -def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): - """Wait until a placement group is ready. - - It prints the informative log messages if the placement group is - not created within time. - - """ - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. - placement_group_specs = current_placement_group.bundle_specs - - s = time.time() - pg_ready_ref = current_placement_group.ready() - wait_interval = 10 - while time.time() - s < PG_WAIT_TIMEOUT: - ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) - if len(ready) > 0: - break - - # Exponential backoff for warning print. - wait_interval *= 2 - logger.info( - "Waiting for creating a placement group of specs for " - "%d seconds. specs=%s. Check " - "`ray status` to see if you have enough resources.", - int(time.time() - s), placement_group_specs) - - try: - ray.get(pg_ready_ref, timeout=0) - except ray.exceptions.GetTimeoutError: - raise ValueError( - "Cannot provide a placement group of " - f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " - "`ray status` to make sure the cluster has enough resources." - ) from None - - -def initialize_ray_cluster( - parallel_config: ParallelConfig, - ray_address: Optional[str] = None, -): - """Initialize the distributed cluster with Ray. - - it will connect to the Ray cluster and create a placement group - for the workers, which includes the specification of the resources - for each distributed worker. - - Args: - parallel_config: The configurations for parallel execution. - ray_address: The address of the Ray cluster. If None, uses - the default Ray cluster address. - """ - assert_ray_available() - - # Connect to a ray cluster. - if current_platform.is_rocm() or current_platform.is_xpu(): - # Try to connect existing ray instance and create a new one if not found - try: - ray.init("auto") - except ConnectionError: - logger.warning( - "No existing RAY instance detected. " - "A new instance will be launched with current node resources.") - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if parallel_config.placement_group: - # Placement group is already set. - return - - device_str = current_platform.ray_device_key - if not device_str: - raise ValueError( - f"current platform {current_platform.device_name} does not " - "support ray.") - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - device_bundles = 0 - for bundle in bundles: - bundle_devices = bundle.get(device_str, 0) - if bundle_devices > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 " - f"{device_str}.") - if bundle_devices: - device_bundles += 1 - if parallel_config.world_size > device_bundles: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group." - f"Required number of devices: {parallel_config.world_size}. " - f"Total number of devices: {device_bundles}.") - else: - num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) - if parallel_config.world_size > num_devices_in_cluster: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group.") - # Create a new placement group - placement_group_specs: List[Dict[str, float]] = ([{ - device_str: 1.0 - } for _ in range(parallel_config.world_size)]) - - # vLLM engine is also a worker to execute model with an accelerator, - # so it requires to have the device in a current node. Check if - # the current node has at least one device. - current_ip = get_ip() - current_node_id = ray.get_runtime_context().get_node_id() - current_node_resource = available_resources_per_node()[current_node_id] - if current_node_resource.get(device_str, 0) < 1: - raise ValueError( - f"Current node has no {device_str} available. " - f"{current_node_resource=}. vLLM engine cannot start without " - f"{device_str}. Make sure you have at least 1 {device_str} " - f"available in a node {current_node_id=} {current_ip=}.") - # This way, at least bundle is required to be created in a current - # node. - placement_group_specs[0][f"node:{current_ip}"] = 0.001 - - # By default, Ray packs resources as much as possible. - current_placement_group = ray.util.placement_group( - placement_group_specs, strategy="PACK") - _wait_until_pg_ready(current_placement_group) - - assert current_placement_group is not None - _verify_bundles(current_placement_group, parallel_config, device_str) - # Set the placement group in the parallel config - parallel_config.placement_group = current_placement_group diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py deleted file mode 100644 index c63d7a4c47c15..0000000000000 --- a/vllm/v1/executor/uniproc_executor.py +++ /dev/null @@ -1,85 +0,0 @@ -import os -from typing import Optional, Tuple - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.worker.gpu_worker import Worker - -logger = init_logger(__name__) - - -class UniprocExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - - self.worker: Worker = self._create_worker() - self.worker.init_device() - self.worker.load_model() - - def _create_worker( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Worker: - """Return worker init args for a given rank.""" - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return Worker( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks by invoking the - underlying worker. - """ - return self.worker.determine_num_available_blocks() - - def initialize(self, num_gpu_blocks: int) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - # NOTE: This is logged in the executor because there can be >1 worker - # with other executors. We could log in the engine level, but work - # remains to abstract away the device for non-GPU configurations. - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.worker.initialize_cache(num_gpu_blocks) - self.worker.compile_or_warm_up_model() - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - output = self.worker.execute_model(scheduler_output) - assert output is not None - return output - - def profile(self, is_start: bool = True): - self.worker.profile(is_start) - - def shutdown(self): - pass - - def check_health(self) -> None: - # UniprocExecutor will always be healthy as long as - # it's running. - return diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py new file mode 100644 index 0000000000000..6d5cc32ffc5b8 --- /dev/null +++ b/vllm/v1/kv_cache_interface.py @@ -0,0 +1,111 @@ +from dataclasses import dataclass +from typing import Dict, List + +import torch + +from vllm.logger import init_logger +from vllm.utils import cdiv, get_dtype_size + +logger = init_logger(__name__) + + +@dataclass +class KVCacheSpecBase: + """ + A base class for specifying the KV cache format of one layer. + """ + + # number of tokens in a block + block_size: int + + @property + def type_id(self) -> str: + """ + The type identifier of this KV cache. + Return different strings for layers with different KV cache type (e.g., + different number of tokens like full attention vs sliding window + attention, different KV cache size per token like layers with different + number of heads) + + Returns: + The type identifier of this KV cache. + """ + raise NotImplementedError + + @property + def page_size_bytes(self) -> int: + """ + The size of a page with `block_size` tokens in bytes. + + Returns: + The page size + """ + raise NotImplementedError + + def bytes_for_tokens(self, num_tokens: int) -> int: + """ + The KV cache size for `num_tokens` tokens in bytes. Returns the real + memory size after padding `num_tokens` to full blocks. + + Returns: + The KV cache size + """ + raise NotImplementedError + + +@dataclass +class FullAttentionSpec(KVCacheSpecBase): + num_kv_heads: int + head_size: int + dtype: torch.dtype + + @property + def type_id(self) -> str: + return f"full_attention_{self.block_size}_{self.page_size_bytes}" + + @property + def page_size_bytes(self) -> int: + return 2 * self.block_size * self.num_kv_heads * self.head_size \ + * get_dtype_size(self.dtype) + + def bytes_for_tokens(self, num_tokens: int) -> int: + return cdiv(num_tokens, self.block_size) * self.page_size_bytes + + +KVCacheSpec = Dict[str, KVCacheSpecBase] + + +@dataclass +class KVCacheTensor: + """ + A dataclass for specifying how the workers should initialize the KV cache + for a layer. Only contains the size of KV cache for that layer for now. Will + be extended to support multiple layers sharing the same memory pool. + """ + size: int # The size of KV cache Tensor in bytes + + +@dataclass +class KVCacheConfig: + """ + The KV cache configuration of a model. + """ + """The number of KV cache blocks""" + num_blocks: int + """layer_name -> how to initialize KV cache for that layer""" + tensors: Dict[str, KVCacheTensor] + """ + A list of kv-cache groups. Each group includes a set of layers with + the same kv-cache spec, and the total page_size of layers inside a group + is same across all groups (as the KVCacheManager only supports allocating + pages of the same size). For example: + 1. A model only uses full attention: one group with all layers in the model. + 2. (not implemented yet) A model with the same number of full attention + layers and sliding window attention layers: two groups, one for full + attention layers and one for sliding window attention layers. + 3. (not implemented yet) A model with 2 full attention layers and 4 sliding + window attention layers: three groups, (full * 2), (sw * 2), (sw * 2). + """ + groups: List[List[str]] + """the KVCacheSpec of the model""" + kv_cache_spec: KVCacheSpec diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b0a7affbebb7e..8dfcf2dd78606 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,13 +1,20 @@ import multiprocessing import os import weakref +from collections import defaultdict from collections.abc import Sequence -from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar, - Union, overload) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, List, + Optional, TypeVar, Union, overload) + +import torch from vllm.logger import init_logger +from vllm.model_executor.models.utils import extract_layer_index from vllm.utils import get_mp_context, kill_process_tree +if TYPE_CHECKING: + from vllm.attention.layer import Attention + logger = init_logger(__name__) T = TypeVar("T") @@ -134,3 +141,48 @@ def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): socket_file = ipc_socket.replace("ipc://", "") if os and os.path.exists(socket_file): os.remove(socket_file) + + +def bind_kv_cache( + kv_caches: Dict[str, torch.Tensor], + forward_context: Dict[str, "Attention"], + runner_kv_caches: List[torch.Tensor], +) -> None: + """ + Bind the allocated KV cache to both ModelRunner and forward context so + that the KV cache can be used in the forward pass. + + This function: + 1) Fills the ModelRunner's kv cache list (`runner_kv_caches`) with + kv_caches. + 2) Associates each attention layer in the `forward_context` with its + corresponding KV cache in kv_caches. + + Args: + kv_caches: The allocated kv_caches with layer names as keys. + forward_context: The global forward context containing all Attention + layers with layer names as keys. + runner_kv_caches: The kv_cache declared by ModelRunner. + """ + # Bind kv_caches to ModelRunner + assert len(runner_kv_caches) == 0 + + # Convert kv_caches dict to a list of tensors in the order of layer_index. + index2name = defaultdict(list) + for layer_name in kv_caches: + index2name[extract_layer_index(layer_name)].append(layer_name) + + for layer_index in sorted(index2name.keys()): + layer_names = index2name[layer_index] + if len(layer_names) > 1: + # One typical case is encoder-decoder model, e.g., bart. + # The cross attention and self attention in the same decoder layer + # has different layer_name but the same layer_index. + raise NotImplementedError + layer_name = layer_names[0] + runner_kv_caches.append(kv_caches[layer_name]) + + # Bind kv_caches to forward context + for layer_name, kv_cache in kv_caches.items(): + # NOTE: Use list because of v0 PP virtual engine. + forward_context[layer_name].kv_cache = [kv_cache] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 40494e64b22f0..28d8e39053874 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -30,6 +30,9 @@ class CachedRequestState: num_computed_tokens: int output_token_ids: List[int] + mrope_positions: Optional[torch.Tensor] = None + mrope_position_delta: Optional[int] = None + @property def num_tokens(self) -> int: return len(self.prompt_token_ids) + len(self.output_token_ids) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index de83640b27cd6..2350074c23a59 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -7,23 +7,28 @@ import torch.distributed import torch.nn as nn +from vllm.attention.backends.abstract import AttentionType +from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.sampling_params import SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - LayerBlockType, bind_kv_cache, cdiv, - is_pin_memory_available) + LayerBlockType, cdiv, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.engine.mm_input_mapper import MMInputMapperClient +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheSpec) from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: @@ -135,6 +140,32 @@ def __init__( self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + # NOTE: `mrope_positions` is implemented as a permuted tensor to + # satisfy the following properties to allow `torch.compile` to work + # properly: + # - shape: (3, ) + # - stride: (1, 3) + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1921022256 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device=self.device) + self.mrope_positions_cpu = torch.zeros((self.max_num_tokens, 3), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + + self.mrope_positions = self.mrope_positions.permute((1, 0)) + self.mrope_positions_cpu = self.mrope_positions_cpu.permute((1, 0)) + self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, @@ -242,6 +273,35 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. + spatial_merge_size, + ) + req_ids_to_add.append(req_id) # Update the cached states of the resumed requests. @@ -309,6 +369,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): arange, out=positions_np) + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + self._calc_mrope_positions(scheduler_output) + # Get token indices. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] @@ -355,8 +420,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Copy the tensors to the GPU. self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - self.positions[:total_num_scheduled_tokens].copy_( - self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) + if self.model_config.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + # Common case (1D positions) + self.positions[:total_num_scheduled_tokens].copy_( + self.positions_cpu[:total_num_scheduled_tokens], + non_blocking=True) query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to( self.device, non_blocking=True) seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( @@ -468,6 +541,61 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + num_reqs = self.input_batch.num_reqs + for index, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + assert req_id is not None + + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + def _prepare_sampling( self, scheduler_output: "SchedulerOutput", @@ -561,6 +689,9 @@ def _gather_encoder_outputs( encoder_outputs.append(encoder_output[start_idx:end_idx]) return encoder_outputs + def get_model(self) -> nn.Module: + return self.model + @torch.inference_mode() def execute_model( self, @@ -614,9 +745,12 @@ def execute_model( # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): + positions = self.mrope_positions[:, :num_input_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_input_tokens] hidden_states = self.model( input_ids=input_ids, - positions=self.positions[:num_input_tokens], + positions=positions, kv_caches=self.kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -703,9 +837,12 @@ def _dummy_run( input_ids = self.input_ids[:num_tokens] inputs_embeds = None with set_forward_context(None, self.vllm_config): + positions = self.mrope_positions[:, :num_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_tokens] hidden_states = model( input_ids=input_ids, - positions=self.positions[:num_tokens], + positions=positions, kv_caches=kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -856,15 +993,71 @@ def capture_model(self) -> None: logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) - def initialize_kv_cache(self, num_blocks: int) -> None: - assert len(self.kv_caches) == 0 - kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - for _ in range(self.num_attn_layers): - self.kv_caches.append( - torch.zeros(kv_cache_shape, - dtype=self.kv_cache_dtype, - device=self.device)) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize KV cache based on `kv_cache_config`. + Args: + kv_cache_config: Configuration for the KV cache, including the KV + cache size of each layer + """ + if len(kv_cache_config.groups) > 1: + raise NotImplementedError( + "Hybrid models with more than one KV cache type are not " + "supported yet.") + + kv_caches: Dict[str, torch.Tensor] = {} + + for layer_name, layer_spec in kv_cache_config.kv_cache_spec.items(): + tensor_config = kv_cache_config.tensors[layer_name] + assert tensor_config.size % layer_spec.page_size_bytes == 0 + num_blocks = tensor_config.size // layer_spec.page_size_bytes + if isinstance(layer_spec, FullAttentionSpec): + kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( + num_blocks, layer_spec.block_size, layer_spec.num_kv_heads, + layer_spec.head_size) + dtype = layer_spec.dtype + kv_caches[layer_name] = torch.zeros(kv_cache_shape, + dtype=dtype, + device=self.device) + else: + raise NotImplementedError + bind_kv_cache( + kv_caches, self.vllm_config.compilation_config.static_forward_context, - [self.kv_caches]) + self.kv_caches) + + def get_kv_cache_spec(self) -> KVCacheSpec: + """ + Generates the KVCacheSpec by parsing the kv cache format from each + Attention module in the static forward context. + Returns: + KVCacheSpec: A dictionary mapping layer names to their KV cache + format. Layers that do not need KV cache are not included. + """ + + forward_ctx = self.vllm_config.compilation_config.static_forward_context + block_size = self.vllm_config.cache_config.block_size + kv_cache_spec: KVCacheSpec = {} + for layer_name, attn_module in forward_ctx.items(): + # TODO: Support other attention modules, e.g., sliding window, + # cross-attention, MLA. + assert isinstance(attn_module, Attention) + if attn_module.attn_type == AttentionType.DECODER: + kv_cache_spec[layer_name] = FullAttentionSpec( + block_size=block_size, + num_kv_heads=attn_module.num_kv_heads, + head_size=attn_module.head_size, + dtype=attn_module.dtype, + ) + elif attn_module.attn_type in (AttentionType.ENCODER, + AttentionType.ENCODER_ONLY): + # encoder-only attention does not need KV cache. + continue + elif attn_module.attn_type == AttentionType.ENCODER_DECODER: + raise NotImplementedError + else: + raise ValueError( + f"Unknown attention type: {attn_module.attn_type}") + + return kv_cache_spec diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 81b247e07ef4a..0929e64d58f1e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1,10 +1,11 @@ """A GPU worker class.""" import gc import os -from typing import TYPE_CHECKING, Optional, Tuple +from typing import TYPE_CHECKING, Optional import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig @@ -16,6 +17,7 @@ from vllm.platforms import current_platform from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size from vllm.v1.core.scheduler import SchedulerOutput +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -112,20 +114,18 @@ def load_model(self) -> None: self.model_runner.load_model() @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. + def determine_available_memory(self) -> int: + """Profiles the peak memory usage of the model to determine how much + memory can be used for KV cache without OOMs. The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. + Then, it calculate the free memory that can be used for KV cache in + bytes. .. tip:: You may limit the usage of GPU memory by adjusting the `gpu_memory_utilization` parameter. """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() @@ -161,33 +161,14 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - cache_block_size = _get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - return num_gpu_blocks, 0 - - def initialize_cache(self, num_gpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks.""" - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - - max_seq_len = self.cache_config.block_size * num_gpu_blocks - max_model_len = self.model_config.max_model_len - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") + return int(available_kv_cache_memory) + + def get_kv_cache_spec(self) -> KVCacheSpec: + return self.model_runner.get_kv_cache_spec() - self.model_runner.initialize_kv_cache(num_gpu_blocks) + def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None: + """Allocate GPU KV cache with the specified kv_cache_config.""" + self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: if not self.model_config.enforce_eager: @@ -196,6 +177,9 @@ def compile_or_warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 303d9a15e9c3c..abbf6450ab7f6 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -509,6 +509,9 @@ def load_model(self) -> None: ) self.model = self.lora_manager.create_lora_manager(self.model) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 260ffaf27f9a1..4c8f69e449393 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -21,6 +21,7 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +import torch.nn as nn from vllm_hpu_extension.ops import LoraMask as LoraMask from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, HabanaMemoryProfiler, format_bytes) @@ -676,6 +677,9 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) + def get_model(self) -> nn.Module: + return self.model + def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ae8b7f97c827d..cb2ff0c934da3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1176,6 +1176,9 @@ def load_model(self) -> None: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + def get_model(self) -> nn.Module: + return self.model + def save_sharded_state( self, path: str, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index c7abad7e0258d..acfd6d0b03f62 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -7,6 +7,7 @@ Optional, Type, TypeVar) import torch +import torch.nn as nn from torch import is_tensor from vllm.config import VllmConfig @@ -264,6 +265,10 @@ def prepare_model_input( """ raise NotImplementedError + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + def execute_model( self, model_input: T, @@ -297,9 +302,9 @@ class ModelRunnerWrapperBase: def __init__( self, - moderl_runner: ModelRunnerBase, + model_runner: ModelRunnerBase, ) -> None: - self.model_runner: ModelRunnerBase = moderl_runner + self.model_runner: ModelRunnerBase = model_runner def __getattr__(self, attr): return getattr(self.model_runner, attr) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index a35f5467e1a1f..596c26eac28bd 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -113,6 +113,9 @@ def load_model(self) -> None: raise NotImplementedError( "Supports only Transformer-NeuronX based models.") + def get_model(self) -> nn.Module: + return self.model + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a38b5a4e6e8d5..9d0a759ca2f21 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -84,6 +84,9 @@ def load_model(self) -> None: kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 50a155d22c666..f5b46cde3969c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -4,6 +4,7 @@ import openvino as ov import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.attention import get_attn_backend @@ -362,6 +363,9 @@ def cache_copy( ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 52c577bccab9c..f5c7bc955a673 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -158,6 +158,9 @@ def load_model(self) -> None: fullgraph=True, dynamic=False) + def get_model(self) -> nn.Module: + return self.model.model + def _dummy_run( self, batch_size: int, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a3e377ef2b19d..29d62ddda3dc0 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -21,7 +21,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling +from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, + memory_profiling) from vllm.worker.cache_engine import CacheEngine from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -55,9 +56,6 @@ def __init__( self.rank = rank self.distributed_init_method = distributed_init_method self.is_driver_worker = is_driver_worker - if is_driver_worker: - assert rank % self.parallel_config.tensor_parallel_size == 0, \ - "Driver worker should be rank 0 of tensor parallel group." if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules @@ -140,7 +138,8 @@ def init_device(self) -> None: _check_if_gpu_supports_dtype(self.model_config.dtype) gc.collect() torch.cuda.empty_cache() - self.init_gpu_memory = torch.cuda.mem_get_info()[0] + torch.cuda.reset_peak_memory_stats() + self.baseline_snapshot = MemorySnapshot() else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -195,10 +194,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - with memory_profiling(baseline_memory_in_bytes=total_gpu_memory - - self.init_gpu_memory, - weights_memory_in_bytes=self.model_runner. - model_memory_usage) as result: + with memory_profiling( + self.baseline_snapshot, + weights_memory=self.model_runner.model_memory_usage) as result: self.model_runner.profile_run() self._assert_memory_footprint_increased_during_profiling() @@ -206,7 +204,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: memory_for_current_instance = total_gpu_memory * \ self.cache_config.gpu_memory_utilization available_kv_cache_memory = (memory_for_current_instance - - result.non_kv_cache_memory_in_bytes) + result.non_kv_cache_memory) # Calculate the number of blocks that can be allocated with the # profiled peak memory. @@ -229,11 +227,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: f"({self.cache_config.gpu_memory_utilization:.2f})" f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" "model weights take " - f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.weights_memory / GiB_bytes):.2f}GiB;" " non_torch_memory takes " - f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" " the rest of the memory reserved for KV Cache is " f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.") @@ -249,11 +247,13 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: def _assert_memory_footprint_increased_during_profiling(self): # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - free_gpu_memory, _ = torch.cuda.mem_get_info() - assert self.init_gpu_memory - free_gpu_memory > 0, ( + free_gpu_memory, total = torch.cuda.mem_get_info() + cuda_memory = total - free_gpu_memory + assert self.baseline_snapshot.cuda_memory < cuda_memory, ( "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " + f"Initial used memory {self.baseline_snapshot.cuda_memory}, " + f"currently used memory {cuda_memory}. " + f"This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") def initialize_cache(self, num_gpu_blocks: int, diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7c14b8344b49e..1104eceef72a3 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -4,7 +4,9 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +import cloudpickle import torch +import torch.nn as nn from vllm.config import ObservabilityConfig, VllmConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group @@ -13,7 +15,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, - resolve_obj_by_qualname, update_environment_variables) + resolve_obj_by_qualname, run_method, + update_environment_variables) from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) @@ -88,6 +91,11 @@ def start_worker_execution_loop(self) -> None: if output is None: return None + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + + @abstractmethod def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -145,6 +153,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -361,6 +372,9 @@ def prepare_input( else: return self._get_worker_input_from_broadcast() + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, @@ -461,7 +475,8 @@ def _execute_model_spmd( class WorkerWrapperBase: """ - The whole point of this class is to lazily initialize the worker. + This class represents one process in an executor/engine. It is responsible + for lazily initializing the worker and handling the worker's lifecycle. We first instantiate the WorkerWrapper, which remembers the worker module and class name. Then, when we call `update_environment_variables`, and the real initialization happens in `init_worker`. @@ -470,9 +485,19 @@ class WorkerWrapperBase: def __init__( self, vllm_config: VllmConfig, - rank: int = 0, + rpc_rank: int = 0, ) -> None: - self.rank = rank + """ + Initialize the worker wrapper with the given vllm_config and rpc_rank. + Note: rpc_rank is the rank of the worker in the executor. In most cases, + it is also the rank of the worker in the distributed group. However, + when multiple executors work together, they can be different. + e.g. in the case of SPMD-style offline inference with TP=2, + users can launch 2 engines/executors, each with only 1 worker. + All workers have rpc_rank=0, but they have different ranks in the TP + group. + """ + self.rpc_rank = rpc_rank self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None if vllm_config.model_config is not None: @@ -485,16 +510,16 @@ def __init__( def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: """ - Adjust the rank based on the given mapping. + Adjust the rpc_rank based on the given mapping. It is only used during the initialization of the executor, - to adjust the rank of workers after we create all workers. + to adjust the rpc_rank of workers after we create all workers. """ - if self.rank in rank_mapping: - self.rank = rank_mapping[self.rank] + if self.rpc_rank in rank_mapping: + self.rpc_rank = rank_mapping[self.rpc_rank] def update_environment_variables(self, envs_list: List[Dict[str, str]]) -> None: - envs = envs_list[self.rank] + envs = envs_list[self.rpc_rank] key = 'CUDA_VISIBLE_DEVICES' if key in envs and key in os.environ: # overwriting CUDA_VISIBLE_DEVICES is desired behavior @@ -507,31 +532,36 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Here we inject some common logic before initializing the worker. Arguments are passed to the worker class constructor. """ - kwargs = all_kwargs[self.rank] + kwargs = all_kwargs[self.rpc_rank] enable_trace_function_call_for_thread(self.vllm_config) - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' + from vllm import configure_as_vllm_process + configure_as_vllm_process() from vllm.plugins import load_general_plugins load_general_plugins() - worker_class = resolve_obj_by_qualname( - self.vllm_config.parallel_config.worker_cls) + if isinstance(self.vllm_config.parallel_config.worker_cls, str): + worker_class = resolve_obj_by_qualname( + self.vllm_config.parallel_config.worker_cls) + else: + assert isinstance(self.vllm_config.parallel_config.worker_cls, + bytes) + worker_class = cloudpickle.loads( + self.vllm_config.parallel_config.worker_cls) self.worker = worker_class(**kwargs) assert self.worker is not None - def execute_method(self, method: str, *args, **kwargs): + def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: target = self if self.worker is None else self.worker - executor = getattr(target, method) - return executor(*args, **kwargs) + return run_method(target, method, args, kwargs) except Exception as e: # if the driver worker also execute methods, # exceptions in the rest worker may cause deadlock in rpc like ray # see https://github.com/vllm-project/vllm/issues/3455 # print the error and inform the user to solve the error - msg = (f"Error executing method {method}. " + msg = (f"Error executing method {method!r}. " "This might cause deadlock in distributed execution.") logger.exception(msg) raise e diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 82b8f22a5af33..25a2fea1e8eac 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -416,6 +416,9 @@ def load_model(self) -> None: logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) + def get_model(self) -> nn.Module: + return self.model + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size()