Add HPU-specific arguments to benchmark_throughput (vllm-project#406)
Modify `benchmark_throughput.py` to allow running with FP8 on HPU (KV
cache dtype `fp8_inc`) and to use padding-aware scheduling.
kdamaszk authored Oct 22, 2024
1 parent 07c98a5 commit acde882
Showing 1 changed file with 36 additions and 2 deletions.
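
For orientation, the new flags combine with the existing benchmark options roughly as in the sketch below; the model name, sequence lengths, and batch sizes are illustrative assumptions, not values taken from this commit:

    python benchmarks/benchmark_throughput.py \
        --model meta-llama/Llama-2-7b-hf \
        --input-len 1024 \
        --output-len 512 \
        --kv-cache-dtype fp8_inc \
        --weights-load-device cpu \
        --use-padding-aware-scheduling \
        --max-num-seqs 128 \
        --max-num-prefill-seqs 16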
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -90,6 +90,10 @@ def run_vllm(
     download_dir: Optional[str] = None,
     load_format: str = EngineArgs.load_format,
     disable_async_output_proc: bool = False,
+    weights_load_device: Optional[str] = None,
+    use_padding_aware_scheduling: bool = False,
+    max_num_seqs: int = 256,
+    max_num_prefill_seqs: Optional[int] = None,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -115,6 +119,10 @@ def run_vllm(
         num_scheduler_steps=num_scheduler_steps,
         use_v2_block_manager=use_v2_block_manager,
         disable_async_output_proc=disable_async_output_proc,
+        weights_load_device=weights_load_device,
+        use_padding_aware_scheduling=use_padding_aware_scheduling,
+        max_num_seqs=max_num_seqs,
+        max_num_prefill_seqs=max_num_prefill_seqs,
     )
 
     # Add the requests to the engine.
@@ -181,6 +189,10 @@ async def run_vllm_async(
     load_format: str = EngineArgs.load_format,
     disable_async_output_proc: bool = False,
     disable_frontend_multiprocessing: bool = False,
+    weights_load_device: Optional[str] = None,
+    use_padding_aware_scheduling: bool = False,
+    max_num_seqs: int = 256,
+    max_num_prefill_seqs: Optional[int] = None,
 ) -> float:
     from vllm import SamplingParams
     engine_args = AsyncEngineArgs(
@@ -208,6 +220,9 @@ async def run_vllm_async(
         disable_async_output_proc=disable_async_output_proc,
         worker_use_ray=False,
         disable_log_requests=True,
+        weights_load_device=weights_load_device,
+        use_padding_aware_scheduling=use_padding_aware_scheduling,
+        max_num_prefill_seqs=max_num_prefill_seqs,
     )
 
     async with build_async_engine_client_from_engine_args(
@@ -342,7 +357,9 @@ def main(args: argparse.Namespace):
         args.max_num_batched_tokens, args.distributed_executor_backend,
         args.gpu_memory_utilization, args.num_scheduler_steps,
         args.use_v2_block_manager, args.download_dir, args.load_format,
-        args.disable_async_output_proc
+        args.disable_async_output_proc, args.weights_load_device,
+        args.use_padding_aware_scheduling, args.max_num_seqs,
+        args.max_num_prefill_seqs
     ]
 
     if args.async_engine:
@@ -446,7 +463,7 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         '--kv-cache-dtype',
         type=str,
-        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'],
         default="auto",
         help='Data type for kv cache storage. If "auto", will use model '
         'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
@@ -540,6 +557,23 @@ def main(args: argparse.Namespace):
                         action='store_true',
                         default=False,
                         help="Disable decoupled async engine frontend.")
+    parser.add_argument("--weights-load-device",
+                        type=str,
+                        default=None,
+                        choices=DEVICE_OPTIONS,
+                        help='Device on which weights are loaded.')
+    parser.add_argument("--use-padding-aware-scheduling",
+                        action='store_true',
+                        default=False,
+                        help="Enable padding-aware scheduling.")
+    parser.add_argument("--max-num-seqs",
+                        type=int,
+                        default=256,
+                        help="Maximum number of requests in a single decode batch.")
+    parser.add_argument("--max-num-prefill-seqs",
+                        type=int,
+                        default=None,
+                        help="Maximum number of requests in a single prefill batch.")
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
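For comparison, a minimal sketch of the `LLM` construction that `run_vllm` now performs, assuming an HPU-enabled vLLM build in which these engine arguments exist; the model name and values are illustrative, not taken from the commit:

    from vllm import LLM

    # Hypothetical values; only vLLM builds with HPU support implement
    # weights_load_device, use_padding_aware_scheduling and
    # max_num_prefill_seqs.
    llm = LLM(
        model="meta-llama/Llama-2-7b-hf",
        kv_cache_dtype="fp8_inc",           # FP8 KV cache on HPU (Intel Neural Compressor)
        weights_load_device="cpu",          # device on which weights are loaded
        use_padding_aware_scheduling=True,  # enable padding-aware scheduling
        max_num_seqs=128,                   # max requests in a single decode batch
        max_num_prefill_seqs=16,            # max requests in a single prefill batch
    )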
