diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 97afd301c8f24..a39d1cf842f06 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -10,7 +10,7 @@ from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs from vllm.inputs import PromptInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import FlexibleArgumentParser @@ -205,13 +205,11 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument( - "--device", - type=str, - default="auto", - choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA, OpenVINO and ' - 'CPU.') + parser.add_argument("--device", + type=str, + default="auto", + choices=DEVICE_OPTIONS, + help='device type for vLLM execution') parser.add_argument('--block-size', type=int, default=16, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 94549d84fb4e4..3f531ee82cc94 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -11,7 +11,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -451,13 +451,11 @@ def main(args: argparse.Namespace): 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' 'instead supported for common inference criteria.') - parser.add_argument( - "--device", - type=str, - default="auto", - choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], - help='device type for vLLM execution, supporting CUDA, OpenVINO and ' - 'CPU.') + parser.add_argument("--device", + type=str, + default="auto", + choices=DEVICE_OPTIONS, + help='device type for vLLM execution') parser.add_argument( "--num-scheduler-steps", type=int, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7748e11092040..3cd26f6770ed3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -26,6 +26,16 @@ ALLOWED_DETAILED_TRACE_MODULES = ["model", "worker", "all"] +DEVICE_OPTIONS = [ + "auto", + "cuda", + "neuron", + "cpu", + "openvino", + "tpu", + "xpu", +] + def nullable_str(val: str): if not val or val == "None": @@ -553,10 +563,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--device", type=str, default=EngineArgs.device, - choices=[ - "auto", "cuda", "neuron", "cpu", "openvino", - "tpu", "xpu" - ], + choices=DEVICE_OPTIONS, help='Device type for vLLM execution.') parser.add_argument('--num-scheduler-steps', type=int,