[PREFIX CACHING FOLLOW UP] OrderedDict-based evictor #3431
Merged
Changes from 6 commits (9 commits total):
OrderedDict-based evictor (ElizaWszola, b98277c)
format (ElizaWszola, 434e6bc)
added benchmark expansion (robertgshaw2-neuralmagic, 51409bf)
format (ElizaWszola, 8873203)
ruff (ElizaWszola, 84570cd)
codespell (ElizaWszola, 6bb58f5)
Clean up benchmark to move to a different pr (40db779)
Merge branch 'main' into ordered-evictor (ElizaWszola, 39a9573)
Post-merge fix
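For context on the title: collections.OrderedDict keeps entries in insertion order and supports O(1) move_to_end() and popitem(last=False), which is what makes it a natural backing store for an LRU-style evictor. The sketch below only illustrates that idea; it is not the evictor code changed in this PR, and the class and method names are made up.

    from collections import OrderedDict


    class LRUEvictorSketch:
        """Minimal LRU evictor backed by an OrderedDict (illustrative only)."""

        def __init__(self):
            # Maps block id -> block, ordered from least to most recently used.
            self.free_table = OrderedDict()

        def add(self, block_id, block):
            # A newly freed block becomes the most recently used entry.
            self.free_table[block_id] = block
            self.free_table.move_to_end(block_id)

        def remove(self, block_id):
            # A block that gets reused is taken out of the free table entirely.
            return self.free_table.pop(block_id)

        def evict(self):
            # popitem(last=False) removes the oldest (least recently used)
            # entry in O(1), which is the point of using an OrderedDict here.
            if not self.free_table:
                raise ValueError("No free blocks available to evict")
            return self.free_table.popitem(last=False)

Compared with scanning a list or plain dict for the least recently used entry on every eviction, this keeps both the bookkeeping and the eviction itself at O(1).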
@@ -0,0 +1,359 @@
# flake8: noqa
# UPSTREAM SYNC: noqa is required for passing ruff run on nm-automation
"""Benchmark online serving throughput.

On the server side, run one of the following commands:
    (vLLM backend)
    python -m vllm.entrypoints.api_server \
        --model <your_model> --swap-space 16 \
        --disable-log-requests

    (TGI backend)
    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>

On the client side, run:
    python benchmarks/benchmark_serving.py \
        --backend <backend> \
        --model <your_model> --dataset <target_dataset> \
        --request-rate <request_rate>
"""
import argparse
import asyncio
import json
import random
import time
from dataclasses import dataclass
from datetime import datetime
from typing import AsyncGenerator, List, Tuple

import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer

from datasets_registry import (
    get_dataset,
    DatasetArgs,
)

from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
)

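# TTFT below is the client-observed time to first token; TPOT is the time per
# output token, computed in calculate_metrics() as
# (request latency - TTFT) / number of generated tokens.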
@dataclass
class BenchmarkMetrics:
    completed: int
    total_input: int
    total_output: int
    request_throughput: float
    input_throughput: float
    output_throughput: float
    mean_ttft_ms: float
    median_ttft_ms: float
    p99_ttft_ms: float
    mean_tpot_ms: float
    median_tpot_ms: float
    p99_tpot_ms: float


def sample_requests(
    dataset_name: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
    # Load the dataset.
    return get_dataset(name=dataset_name,
                       tokenizer=tokenizer,
                       dataset_args=DatasetArgs(num_samples=num_requests))


async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
) -> AsyncGenerator[Tuple[str, int, int], None]:
    input_requests = iter(input_requests)
    for request in input_requests:
        yield request

        if request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue
        # Sample the request interval from the exponential distribution.
        interval = np.random.exponential(1.0 / request_rate)
        # The next request will be sent after the interval.
        await asyncio.sleep(interval)


def calculate_metrics(
    input_requests: List[Tuple[str, int, int]],
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
) -> BenchmarkMetrics:
    total_output = 0
    total_input = 0
    completed = 0
    per_token_latencies = []
    ttfts = []
    for i in range(len(outputs)):
        if outputs[i].success:
            output_len = len(tokenizer.encode(outputs[i].generated_text))
            total_output += output_len
            total_input += input_requests[i][1]
            per_token_latencies.append(
                (outputs[i].latency - outputs[i].ttft) / output_len)
            ttfts.append(outputs[i].ttft)
            completed += 1

    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=total_output,
        request_throughput=completed / dur_s,
        input_throughput=total_input / dur_s,
        output_throughput=total_output / dur_s,
        mean_ttft_ms=np.mean(ttfts) * 1000,
        median_ttft_ms=np.median(ttfts) * 1000,
        p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
        mean_tpot_ms=np.mean(per_token_latencies) * 1000,
        median_tpot_ms=np.median(per_token_latencies) * 1000,
        p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
    )

    return metrics

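# benchmark() drives the load test: it creates one async request task per
# sampled prompt (paced by get_request), waits for all of them to finish,
# then prints and returns the aggregate metrics.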
async def benchmark(
    backend: str,
    api_url: str,
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    best_of: int,
    use_beam_search: bool,
    request_rate: float,
    disable_tqdm: bool,
):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS.get(backend)
    else:
        raise ValueError(f"Unknown backend: {backend}")

    print(f"Traffic request rate: {request_rate}")

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    benchmark_start_time = time.perf_counter()
    tasks = []
    async for request in get_request(input_requests, request_rate):
        prompt, prompt_len, output_len = request
        request_func_input = RequestFuncInput(
            model=model_id,
            prompt=prompt,
            api_url=api_url,
            prompt_len=prompt_len,
            output_len=output_len,
            best_of=best_of,
            use_beam_search=use_beam_search,
        )
        tasks.append(
            asyncio.create_task(
                request_func(request_func_input=request_func_input,
                             pbar=pbar)))
    outputs = await asyncio.gather(*tasks)

    if not disable_tqdm:
        pbar.close()

    benchmark_duration = time.perf_counter() - benchmark_start_time

    metrics = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
        dur_s=benchmark_duration,
        tokenizer=tokenizer,
    )

    print(f"Successful requests: {metrics.completed}")
    print(f"Benchmark duration: {benchmark_duration:.2f} s")
    print(f"Total input tokens: {metrics.total_input}")
    print(f"Total generated tokens: {metrics.total_output}")
    print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
    print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
    print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
    print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
    print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
    print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
    print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
    print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
    print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")

    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        "input_throughput": metrics.input_throughput,
        "output_throughput": metrics.output_throughput,
        "mean_ttft_ms": metrics.mean_ttft_ms,
        "median_ttft_ms": metrics.median_ttft_ms,
        "p99_ttft_ms": metrics.p99_ttft_ms,
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
        "p99_tpot_ms": metrics.p99_tpot_ms
    }
    return result

def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    backend = args.backend
    model_id = args.model
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model

    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"

    tokenizer = get_tokenizer(tokenizer_id,
                              trust_remote_code=args.trust_remote_code)
    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    benchmark_result = asyncio.run(
        benchmark(
            backend=backend,
            api_url=api_url,
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
            disable_tqdm=args.disable_tqdm,
        ))

    # Save config and results to json
    if args.save_result:
        result_json = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["version"] = args.version
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts

        # Traffic
        result_json["request_rate"] = (
            args.request_rate if args.request_rate < float("inf") else "inf")

        # Merge with benchmark result
        result_json = {**result_json, **benchmark_result}

        # Save to file
        base_model_id = model_id.split("/")[-1]
        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
        with open(file_name, "w") as outfile:
            json.dump(result_json, outfile)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        default="vllm",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
    parser.add_argument(
        "--version",
        type=str,
        default="N/A",
        help="Version of the serving backend/engine.",
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default=None,
        help="Server or API base url if not using http host and port.",
    )
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument(
        "--endpoint",
        type=str,
        default="/generate",
        help="API endpoint.",
    )
    parser.add_argument("--dataset",
                        type=str,
                        choices=["sharegpt", "ultrachat", "sonnet"],
                        required=True,
                        help="Name of the dataset.")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Name of the model.",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        help=
        "Name or path of the tokenizer, if not using the default model tokenizer.",
    )
    parser.add_argument(
        "--best-of",
        type=int,
        default=1,
        help="Generates `best_of` sequences per prompt and "
        "returns the best one.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-prompts",
        type=int,
        default=1000,
        help="Number of prompts to process.",
    )
    parser.add_argument(
        "--request-rate",
        type=float,
        default=float("inf"),
        help="Number of requests per second. If this is inf, "
        "then all the requests are sent at time 0. "
        "Otherwise, we use a Poisson process to synthesize "
        "the request arrival times.",
    )
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Trust remote code from huggingface",
    )
    parser.add_argument(
        "--disable-tqdm",
        action="store_true",
        help="Specify to disable tqdm progress bar.",
    )
    parser.add_argument(
        "--save-result",
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )

    args = parser.parse_args()
    main(args)
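Putting the flags above together, a typical client-side invocation of this script might look like the following (the model name, dataset choice, and request rate are placeholders, not values taken from the PR):

    python benchmark_serving.py \
        --backend vllm \
        --model <your_model> \
        --dataset sharegpt \
        --num-prompts 1000 \
        --request-rate 4 \
        --save-result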
Reviewer comment:
What's the difference between this and the old benchmark_serving.py? If it's just WIP code it's fine. Otherwise let's include this in another PR?

Reply:
I'll make a separate PR that rewrites benchmark_serving.py a bit.