From e9e415bb6f45c118b92b3314cf9bb53eb2443231 Mon Sep 17 00:00:00 2001 From: Sangjin Choi Date: Thu, 17 Oct 2024 12:33:57 +0900 Subject: [PATCH] baseline scripts --- baseline/baseline.py | 245 +++++++++++++++++ baseline/baseline_adaptive.py | 348 ++++++++++++++++++++++++ baseline/baseline_adaptive.sh | 1 + baseline/baseline_ar.py | 250 +++++++++++++++++ baseline/baseline_ar_chunked_prefill.py | 238 ++++++++++++++++ baseline/chunked_prefill.sh | 130 +++++++++ baseline/run_all.sh | 190 +++++++++++++ baseline/run_all_A100.sh | 190 +++++++++++++ 8 files changed, 1592 insertions(+) create mode 100644 baseline/baseline.py create mode 100644 baseline/baseline_adaptive.py create mode 100755 baseline/baseline_adaptive.sh create mode 100644 baseline/baseline_ar.py create mode 100644 baseline/baseline_ar_chunked_prefill.py create mode 100755 baseline/chunked_prefill.sh create mode 100755 baseline/run_all.sh create mode 100755 baseline/run_all_A100.sh diff --git a/baseline/baseline.py b/baseline/baseline.py new file mode 100644 index 0000000000000..35eb9574f01a9 --- /dev/null +++ b/baseline/baseline.py @@ -0,0 +1,245 @@ +"""Benchmark offline inference throughput.""" + +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple +import json + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +import torch +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' +BENCHMARK_DURATION_IN_MINUTES = 5 + +# Disable garbage collection for performance +gc.disable() + +if torch.cuda.is_available(): + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + print(gpu_name) +else: + print("No CUDA device available") + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """Generates requests with associated times based on a Poisson process.""" + requests_with_time = [] + current_time = 0.0 + + for request in cycle(input_requests): + requests_with_time.append((current_time, request)) + interval = np.random.exponential(1.0 / request_rate) + current_time += interval + + # Add 1 minute to the benchmark duration for safety + if current_time > (BENCHMARK_DURATION_IN_MINUTES + 1) * 60: + break + + return requests_with_time + + +def run(llm: LLM, requests: List[Tuple[str, int, int]], request_rate: float, temperature: float) -> Tuple[dict, int, bool]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, request_rate) + outputs: List[RequestOutput] = [] + result = {} + + start_time = time.perf_counter() + + request_index = 0 + while time.perf_counter() - start_time < BENCHMARK_DURATION_IN_MINUTES * 60: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + 
inputs=prompt, params=sampling_params) + result[str(request_id)] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].append(ttft) + + if output.finished: + e2e_latency = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + + throughput = len(outputs) / (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} reqs/s") + + # remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Drop Threshold", args.drop_threshold], + ["Target Attention", args.target_attention], + ["Dataset", args.dataset], + ["Request Rate", args.request_rate], + ] + print(tabulate(config_table)) + llm = LLM( + model=args.target_model, + speculative_model=args.draft_model, + num_speculative_tokens=args.draft_size, + use_v2_block_manager=True, + gpu_memory_utilization=0.85, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens = run( + llm, requests, args.request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + #remove spaces in gpu_name + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + gpu_name = gpu_name.replace(" ", "") + + # print("GPU Name,Target Model, 
Draft Model, Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token), Disable by Batch Size") + # print(f"Result,{gpu_name},{args.target_model},{args.draft_model},{args.dataset},{args.temperature},{args.request_rate},{args.draft_size},{request_throughput},{token_throughput},{token_latency},{p50_ttft},{p99_ttft},{p50_tpot},{p99_tpot},{p50_token_latency},{p99_token_latency},False") + print(f"Result, {request_throughput:.3f}, {token_throughput:.3f}, {token_latency:.6f}, {p50_ttft:.6f}, {p99_ttft:.6f}, {p50_tpot:.6f}, {p99_tpot:.6f}, {p50_token_latency:.6f}, {p99_token_latency:.6f}, False") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default='sharegpt', + help="Path to the dataset.") + parser.add_argument("--input-len", type=int, default=None, + help="Input prompt length for each request.") + parser.add_argument("--output-len", type=int, default=None, + help="Output length for each request. Overrides the output length from the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=4) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature.") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'prioritize_prefill', 'full_prefill', 'chunked_prefill', 'chunked_prefill_demote_draft'], default='full_prefill') + parser.add_argument("--target-attention", + action="store_true", help="Use target attention.") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=64, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument("--request-rate", type=float, + default=4, 
help="Number of requests per second.") + parser.add_argument("--speculative-disable-by-batch-size", type=int, default=4) + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + if args.dataset is None or args.dataset == "dummy": + args.dataset = "dummy" + + main(args) \ No newline at end of file diff --git a/baseline/baseline_adaptive.py b/baseline/baseline_adaptive.py new file mode 100644 index 0000000000000..eb24f9911a877 --- /dev/null +++ b/baseline/baseline_adaptive.py @@ -0,0 +1,348 @@ +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' + +# Disable garbage collection for performance +gc.disable() + +# Global variables for benchmark duration +BENCHMARK_DURATION_IN_MINUTES = 3 + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + high_request_rate: float, + mid_request_rate: float, + low_request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """ + Generates requests with associated times based on a custom request rate pattern: + Long Low Rate -> Mid Rate -> Short High Rate -> Mid Rate -> Long Low Rate + """ + requests_with_time = [] + current_time = 0.0 + total_duration = BENCHMARK_DURATION_IN_MINUTES * 60 # Total benchmark duration in seconds + + # Define the durations for each phase + phase_durations = [ + total_duration / 5, # Phase 1: Low Rate + total_duration / 5, # Phase 2: Mid Rate + total_duration / 5, # Phase 3: High Rate + total_duration / 5, # Phase 4: Mid Rate + total_duration / 5 # Phase 5: Low Rate + ] + + # Define the phases with their corresponding request rates + phases = [ + (low_request_rate, phase_durations[0]), # Phase 1: Low Rate + (mid_request_rate, phase_durations[1]), # Phase 2: Mid Rate + (high_request_rate, phase_durations[2]), # Phase 3: High Rate + (mid_request_rate, phase_durations[3]), # Phase 4: Mid Rate + (low_request_rate, phase_durations[4]), # Phase 5: Low Rate + ] + + phase_index = 0 + current_request_rate, phase_duration = phases[phase_index] + time_period_end = phase_duration + + for request in cycle(input_requests): + # Generate inter-arrival time based on current request rate + if current_request_rate > 0: + interval = np.random.exponential(1.0 / current_request_rate) + else: + interval = float('inf') + current_time += interval + + # Update request rate based on time + if current_time > time_period_end and phase_index < len(phases) - 1: + phase_index += 1 + current_request_rate, phase_duration = phases[phase_index] + time_period_end += phase_duration # Accumulate the durations for time thresholds + + if current_time > total_duration: + break + + requests_with_time.append((current_time, request)) + + return requests_with_time + +def warmup(llm): + sampling_params = SamplingParams( + n=1, + temperature=0, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=128, + ) + dummy_prompt_token_ids = [[0] * 32] * 8 + start_time = time.perf_counter() + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time 
+ return latency + +def run(llm: LLM, requests: List[Tuple[str, int, int]], + high_request_rate: float, mid_request_rate: float, low_request_rate: float, + temperature: float) -> Tuple[dict, int, bool, List[Tuple[float, float]], List[Tuple[float, float]]]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, high_request_rate, mid_request_rate, low_request_rate) + outputs: List[RequestOutput] = [] + result = {} + token_latencies_over_time = [] + token_throughput_over_time = [] # New: To store token throughput over time + requests_over_time = [] + + interval_duration = 10.0 # Interval duration in seconds + next_interval_time = interval_duration # Time when the next interval ends + tokens_in_interval = 0 # Tokens processed in the current interval + + start_time = time.perf_counter() + + total_processed_tokens = 0 + request_index = 0 + while True: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while request_index < len(requests_with_time) and requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + inputs=prompt, params=sampling_params) + result[request_id] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].append(ttft) + + if output.finished: + current_time_in_run = time.perf_counter() - start_time + e2e_latency = current_time_in_run - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + num_tokens = len(output.prompt_token_ids) + len(output.outputs[0].token_ids) + total_processed_tokens += num_tokens + tokens_in_interval += num_tokens # Update tokens in the current interval + + # Calculate token latency + token_latency = e2e_latency / len(output.outputs[0].token_ids) + token_latencies_over_time.append((current_time_in_run, token_latency)) + + num_request = len(step_outputs) + requests_over_time.append((time.perf_counter() - start_time, num_request)) + + # Check if we've reached the end of the current interval + if current_time >= next_interval_time: + interval_throughput = tokens_in_interval / interval_duration + token_throughput_over_time.append((next_interval_time, interval_throughput)) + tokens_in_interval = 0 # Reset token count for next interval + next_interval_time += interval_duration + print(f"Token Throughput at {next_interval_time}s: {interval_throughput:.3f} tokens/s") + + throughput = total_processed_tokens / \ + (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} tokens/s") + + if not llm.llm_engine.has_unfinished_requests() and (current_time > BENCHMARK_DURATION_IN_MINUTES * 60): + break + + # Remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = 
sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens, token_latencies_over_time, token_throughput_over_time, requests_over_time # Return the throughput data + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Selective Validation", args.selective_validation], + ["Drop Threshold", args.drop_threshold], + ["Consolidated Attention", args.consolidated_attention], + ["Dataset", args.dataset], + ["High Request Rate", args.high_request_rate], + ["Mid Request Rate", args.mid_request_rate], + ["Low Request Rate", args.low_request_rate], + ["Benchmark Duration (min)", args.benchmark_duration], + ] + print(tabulate(config_table)) + + global BENCHMARK_DURATION_IN_MINUTES + BENCHMARK_DURATION_IN_MINUTES = args.benchmark_duration + + llm = LLM( + model=args.target_model, + gpu_memory_utilization=0.85, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + + warmup(llm) + + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens, token_latencies_over_time, token_throughput_over_time, requests_over_time = run( + llm, requests, args.high_request_rate, args.mid_request_rate, args.low_request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + # Print all results in csv format + # print("Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token)," + # "P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token)," + # "Preempt Flag") + # print(f"result, {request_throughput:.3f}, {token_throughput:.3f}, {token_latency:.6f}, {p50_ttft:.6f}, {p99_ttft:.6f}, {p50_tpot:.6f}, {p99_tpot:.6f}, {p50_token_latency:.6f}, {p99_token_latency:.6f}, {preempt_flag}") + + file_name_prefix = f"AR" + + # Write token latencies over time to a CSV file + with open(f'token_latencies_over_time_{file_name_prefix}.csv', 'w') as f: + 
f.write('Time(s),Token_Latency(s/token)\n') + for time_point, latency in token_latencies_over_time: + f.write(f"{time_point},{latency}\n") + + # Write token throughput over time to a CSV file + with open(f'token_throughput_over_time_{file_name_prefix}.csv', 'w') as f: + f.write('Time(s),Token_Throughput(tokens/s)\n') + for time_point, throughput in token_throughput_over_time: + f.write(f"{time_point},{throughput}\n") + + with open(f'requests_over_time_{file_name_prefix}.csv', 'w') as f: + f.write('Time(s),Num_Requests\n') + for time_point, num_requests in requests_over_time: + f.write(f"{time_point},{num_requests}\n") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default=None, + help="Path to the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=0) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature in [0, 0.25, 0.5, 0.75].") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'full_prefill', 'chunked_prefill'], default='full_prefill') + parser.add_argument("--consolidated-attention", + action="store_true", help="Use consolidated attention.") + parser.add_argument("--selective-validation", action="store_true") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=128, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument('--benchmark-duration', type=int, default=5, + help='Benchmark duration in minutes.') + parser.add_argument('--low-request-rate', type=float, default=6, + help='Low request rate for changing arrival pattern.') + parser.add_argument('--mid-request-rate', type=float, default=12, + help='Mid request rate for changing arrival pattern.') + parser.add_argument('--high-request-rate', type=float, default=24, + help='High 
request rate for changing arrival pattern.') + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + + assert args.dataset is not None + + main(args) diff --git a/baseline/baseline_adaptive.sh b/baseline/baseline_adaptive.sh new file mode 100755 index 0000000000000..f7602bbea1711 --- /dev/null +++ b/baseline/baseline_adaptive.sh @@ -0,0 +1 @@ +python baseline_adaptive.py --dataset gsm8k --temperature 0.5 --prefill-schedule-mode full_prefill --budget-token 4096 --budget-seq 256 --target-model facebook/opt-6.7b --draft-model facebook/opt-125m diff --git a/baseline/baseline_ar.py b/baseline/baseline_ar.py new file mode 100644 index 0000000000000..2bbf7852ab3ec --- /dev/null +++ b/baseline/baseline_ar.py @@ -0,0 +1,250 @@ +"""Benchmark offline inference throughput.""" + +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple +import json + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +import torch +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' +BENCHMARK_DURATION_IN_MINUTES = 5 + +# Disable garbage collection for performance +gc.disable() + +if torch.cuda.is_available(): + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + print(gpu_name) +else: + print("No CUDA device available") + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """Generates requests with associated times based on a Poisson process.""" + requests_with_time = [] + current_time = 0.0 + + for request in cycle(input_requests): + requests_with_time.append((current_time, request)) + interval = np.random.exponential(1.0 / request_rate) + current_time += interval + + # Add 1 minute to the benchmark duration for safety + if current_time > (BENCHMARK_DURATION_IN_MINUTES + 1) * 60: + break + + return requests_with_time + + +def run(llm: LLM, requests: List[Tuple[str, int, int]], request_rate: float, temperature: float) -> Tuple[dict, int, bool]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, request_rate) + outputs: List[RequestOutput] = [] + result = {} + + start_time = time.perf_counter() + + request_index = 0 + while time.perf_counter() - start_time < BENCHMARK_DURATION_IN_MINUTES * 60: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + inputs=prompt, params=sampling_params) + result[str(request_id)] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + 
result[output.request_id].append(ttft) + + if output.finished: + e2e_latency = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + + throughput = len(outputs) / (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} reqs/s") + + # remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens + + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Drop Threshold", args.drop_threshold], + ["Target Attention", args.target_attention], + ["Dataset", args.dataset], + ["Request Rate", args.request_rate], + ] + print(tabulate(config_table)) + llm = LLM( + model=args.target_model, + gpu_memory_utilization=0.85, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens = run( + llm, requests, args.request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + #remove spaces in gpu_name + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + gpu_name = gpu_name.replace(" ", "") + + # print("GPU Name,Target Model, Draft Model, Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token), Disable by Batch Size") + # 
print(f"Result,{gpu_name},{args.target_model},{args.draft_model},{args.dataset},{args.temperature},{args.request_rate},{args.draft_size},{request_throughput},{token_throughput},{token_latency},{p50_ttft},{p99_ttft},{p50_tpot},{p99_tpot},{p50_token_latency},{p99_token_latency},False") + + + # # Print all results in csv format + # print("Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token)," + # "P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token)," + # "Preempt Flag") + print(f"Result, {request_throughput:.3f}, {token_throughput:.3f}, {token_latency:.6f}, {p50_ttft:.6f}, {p99_ttft:.6f}, {p50_tpot:.6f}, {p99_tpot:.6f}, {p50_token_latency:.6f}, {p99_token_latency:.6f}, False") + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default='sharegpt', + help="Path to the dataset.") + parser.add_argument("--input-len", type=int, default=None, + help="Input prompt length for each request.") + parser.add_argument("--output-len", type=int, default=None, + help="Output length for each request. Overrides the output length from the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=4) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature.") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'prioritize_prefill', 'full_prefill', 'chunked_prefill', 'chunked_prefill_demote_draft'], default='full_prefill') + parser.add_argument("--target-attention", + action="store_true", help="Use target attention.") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=64, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument("--request-rate", type=float, + default=4, 
help="Number of requests per second.") + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + if args.dataset is None or args.dataset == "dummy": + args.dataset = "dummy" + + main(args) \ No newline at end of file diff --git a/baseline/baseline_ar_chunked_prefill.py b/baseline/baseline_ar_chunked_prefill.py new file mode 100644 index 0000000000000..34c14ab381f0d --- /dev/null +++ b/baseline/baseline_ar_chunked_prefill.py @@ -0,0 +1,238 @@ +"""Benchmark offline inference throughput.""" + +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple +import json + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +import torch +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' +BENCHMARK_DURATION_IN_MINUTES = 5 + +# Disable garbage collection for performance +gc.disable() + +if torch.cuda.is_available(): + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + print(gpu_name) +else: + print("No CUDA device available") + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """Generates requests with associated times based on a Poisson process.""" + requests_with_time = [] + current_time = 0.0 + + for request in cycle(input_requests): + requests_with_time.append((current_time, request)) + interval = np.random.exponential(1.0 / request_rate) + current_time += interval + + # Add 1 minute to the benchmark duration for safety + if current_time > (BENCHMARK_DURATION_IN_MINUTES + 1) * 60: + break + + return requests_with_time + + +def run(llm: LLM, requests: List[Tuple[str, int, int]], request_rate: float, temperature: float) -> Tuple[dict, int, bool]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, request_rate) + outputs: List[RequestOutput] = [] + result = {} + + start_time = time.perf_counter() + + request_index = 0 + while time.perf_counter() - start_time < BENCHMARK_DURATION_IN_MINUTES * 60: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + inputs=prompt, params=sampling_params) + result[str(request_id)] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].append(ttft) + + if output.finished: + e2e_latency = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + + 
throughput = len(outputs) / (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} reqs/s") + + # remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens + + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Drop Threshold", args.drop_threshold], + ["Target Attention", args.target_attention], + ["Dataset", args.dataset], + ["Request Rate", args.request_rate], + ] + print(tabulate(config_table)) + llm = LLM( + model=args.target_model, + gpu_memory_utilization=0.85, + enable_chunked_prefill=True, + max_num_seqs=args.budget_seq, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens = run( + llm, requests, args.request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + print("GPU Name,Target Model,Draft Model,Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token), Disable by Batch Size") + print(f"Result,{gpu_name},{args.target_model},{args.draft_model},{args.dataset},{args.temperature},{args.request_rate},{args.draft_size},{request_throughput},{token_throughput},{token_latency},{p50_ttft},{p99_ttft},{p50_tpot},{p99_tpot},{p50_token_latency},{p99_token_latency}, False") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default='sharegpt', + help="Path to the dataset.") + parser.add_argument("--input-len", type=int, default=None, + 
help="Input prompt length for each request.") + parser.add_argument("--output-len", type=int, default=None, + help="Output length for each request. Overrides the output length from the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=4) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature.") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'prioritize_prefill', 'full_prefill', 'chunked_prefill', 'chunked_prefill_demote_draft'], default='full_prefill') + parser.add_argument("--target-attention", + action="store_true", help="Use target attention.") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=256, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument("--request-rate", type=float, + default=4, help="Number of requests per second.") + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + if args.dataset is None or args.dataset == "dummy": + args.dataset = "dummy" + + main(args) \ No newline at end of file diff --git a/baseline/chunked_prefill.sh b/baseline/chunked_prefill.sh new file mode 100755 index 0000000000000..1b5c5b050a70a --- /dev/null +++ b/baseline/chunked_prefill.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# Get GPU name using Python script +gpu_name=$(python3 get_gpu_name.py) + +# Cleanup the GPU name +gpu_name=$(echo $gpu_name | tr -d '[:space:]') + +echo "GPU: $gpu_name" + +# Model pairs to benchmark +declare -a models=( + # Uncomment the models you want to benchmark + # "facebook/opt-13b,facebook/opt-125m" + # "facebook/opt-6.7b,facebook/opt-125m" + "EleutherAI/pythia-6.9b,EleutherAI/pythia-160m" + # Add more model pairs as needed +) + +# Common arguments +# Define the request rates, draft sizes, and temperatures +#request_rates=(2 4 6 8 10 12 14 16 18 20) +request_rates=(6) 
+draft_sizes_ar=(0) +draft_sizes_speculative=(1 3 5 7) +temperatures=(0 0.25 0.5 0.75 -1) + +# Define other default arguments (adjust as necessary) +datasets=("sharegpt") + +# Output CSV file +output_csv="chunked_prefill_A6000.csv" + +# Initialize the CSV file with the header if it doesn't exist +if [ ! -f "$output_csv" ]; then + echo "Result,GPU Name,Target Model,Draft Model,Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token),Preempt Flag" > "$output_csv" +fi + +# Calculate total number of iterations for progress tracking +total_iterations=$(( ${#models[@]} * ${#datasets[@]} * ${#request_rates[@]} * ( ${#draft_sizes_ar[@]} ) )) +iteration=0 + +# AutoRegressive Decoding +# Loop through each model pair, dataset, request rate, and draft size combination +for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_ar[@]}"; do + iteration=$((iteration + 1)) + ./slack "Progress: $iteration/$total_iterations" + echo "Running AR benchmark with target model: $target_model, draft model: $draft_model, dataset: $dataset, request rate: $request_rate, draft size: $draft_size" + + # Run the benchmark script and append the output to the CSV file + python3 baseline_ar_chunked_prefill.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --request-rate "$request_rate" | grep "Result" >> "$output_csv" + + echo "Saved results for target model: $target_model, draft model: $draft_model, dataset: $dataset, request rate: $request_rate, draft size: $draft_size" + done + done + done +done + +echo "All AutoRegressive benchmarks completed." + +# # Speculative Decoding +# # Loop through each model pair, dataset, temperature, request rate, and draft size combination +# for model_pair in "${models[@]}"; do +# IFS=',' read -r target_model draft_model <<< "$model_pair" +# for dataset in "${datasets[@]}"; do +# for temperature in "${temperatures[@]}"; do +# for request_rate in "${request_rates[@]}"; do +# for draft_size in "${draft_sizes_speculative[@]}"; do +# iteration=$((iteration + 1)) +# ./slack "Progress: $iteration/$total_iterations" +# echo "Running Speculative benchmark with target model: $target_model, draft model: $draft_model, dataset: $dataset, temperature: $temperature, request rate: $request_rate, draft size: $draft_size" + +# # Run the benchmark script and append the output to the CSV file +# python3 baseline.py \ +# --dataset "$dataset" \ +# --target-model "$target_model" \ +# --draft-model "$draft_model" \ +# --draft-size "$draft_size" \ +# --temperature "$temperature" \ +# --request-rate "$request_rate" | grep "Result" >> "$output_csv" + +# echo "Saved results for target model: $target_model, draft model: $draft_model, dataset: $dataset, temperature: $temperature, request rate: $request_rate, draft size: $draft_size" +# done +# done +# done +# done +# done + +# echo "All Speculative Decoding benchmarks completed." 
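+# Example (not executed by this script): a single speculative configuration can
+# be benchmarked directly with baseline.py, e.g.
+#   python3 baseline.py --dataset sharegpt \
+#     --target-model EleutherAI/pythia-6.9b --draft-model EleutherAI/pythia-160m \
+#     --draft-size 5 --temperature 0 --request-rate 6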
+ +# # Disable Speculative Decoding by Batch Size +# # Define disable size array +# disable_size=(32 64 128) + +# # Loop through each model pair, request rate, draft size, and disable size combination +# for model_pair in "${models[@]}"; do +# IFS=',' read -r target_model draft_model <<< "$model_pair" +# for request_rate in "${request_rates[@]}"; do +# for draft_size in "${draft_sizes_speculative[@]}"; do +# for disable in "${disable_size[@]}"; do +# iteration=$((iteration + 1)) +# ./slack "Progress: $iteration/$total_iterations" +# echo "Running benchmark with target model: $target_model, draft model: $draft_model, request rate: $request_rate, draft size: $draft_size, disable size: $disable" + +# # Run the benchmark script and append the output to the CSV file +# python3 baseline_specdis.py \ +# --dataset "$dataset" \ +# --target-model "$target_model" \ +# --draft-model "$draft_model" \ +# --draft-size "$draft_size" \ +# --temperature "$temperature" \ +# --speculative-disable-by-batch-size "$disable" \ +# --request-rate "$request_rate" | grep "Result" >> "$output_csv" + +# echo "Saved results for target model: $target_model, draft model: $draft_model, request rate: $request_rate, draft size: $draft_size, disable size: $disable" +# done +# done +# done +# done + +# echo "All benchmarks completed." diff --git a/baseline/run_all.sh b/baseline/run_all.sh new file mode 100755 index 0000000000000..7d80389580470 --- /dev/null +++ b/baseline/run_all.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +set -e +set -o pipefail + +export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps + +# Get GPU name using Python script +gpu_name=$(python3 get_gpu_name.py) + +# Cleanup the GPU name +gpu_name=$(echo "$gpu_name" | tr -d '[:space:]') + +echo "GPU: $gpu_name" + +# Model pairs to benchmark +declare -a models=( + # "huggyllama/llama-7b,JackFram/llama-68m" + # "facebook/opt-6.7b,facebook/opt-125m" + "EleutherAI/pythia-6.9b,EleutherAI/pythia-160m" +) + +# Common arguments +datasets=("finance") +temperatures=(0.75) +request_rates=(8) +draft_sizes_ar=(0) +draft_sizes_speculative=(1 3 5 7) +budget_seqs=(64) +prefill_schedule_mode="full_prefill" +colocate="False" +consolidated_attention="False" +drop_threshold="0" +budget_token="4096" + +# Output CSV file +output_csv="baseline_pythia_finance_$gpu_name.csv" + +# Initialize CSV file +initialize_csv() { + if [ ! 
-f "$output_csv" ]; then + echo "Initializing CSV file: $output_csv" + echo "gpu_name,target_model,draft_model,dataset,temperature,request_rate,draft_size,prefill_schedule_mode,budget_token,budget_seq,colocate,consolidated_attention,drop_threshold,p50_ttft,p99_ttft,p50_tpot,p99_tpot,p50_token_latency,p99_token_latency,token_throughput,request_throughput,token_latency,preempt_flag" > "$output_csv" + fi +} + +# Initialize the CSV file +initialize_csv + +# Function to check if configuration exists in CSV +configuration_exists_in_csv() { + local gpu_name="$1" + local target_model="$2" + local draft_model="$3" + local dataset="$4" + local temperature="$5" + local request_rate="$6" + local draft_size="$7" + local budget_seq="$8" # Add budget_seq parameter + + # Skip header line (NR > 1) + if awk -v OFS=',' -F', *' -v gpu_name="$gpu_name" \ + -v target_model="$target_model" \ + -v draft_model="$draft_model" \ + -v dataset="$dataset" \ + -v temperature="$temperature" \ + -v request_rate="$request_rate" \ + -v draft_size="$draft_size" \ + -v budget_seq="$budget_seq" ' # Include budget_seq in the check + NR > 1 { + for (i=1; i<=NF; i++) { gsub(/^ +| +$/, "", $i) } + if ($1 == gpu_name && $2 == target_model && $3 == draft_model && $4 == dataset && $5 == temperature && $6 == request_rate && $7 == draft_size && $10 == budget_seq) { + found = 1; exit + } + } + END { exit !found } + ' "$output_csv"; then + return 0 # Configuration exists + else + return 1 # Configuration does not exist + fi +} + + +# Function to extract values from the benchmark output +extract_values() { + local log_file="$1" + local result_line=$(grep 'Result' "$log_file") + if [ -z "$result_line" ]; then + echo "Error: No 'result' line found in output." + return 1 + fi + IFS=', ' read -ra metrics <<< "$result_line" + request_throughput="${metrics[1]}" + token_throughput="${metrics[2]}" + token_latency="${metrics[3]}" + p50_ttft="${metrics[4]}" + p99_ttft="${metrics[5]}" + p50_tpot="${metrics[6]}" + p99_tpot="${metrics[7]}" + p50_token_latency="${metrics[8]}" + p99_token_latency="${metrics[9]}" + preempt_flag="${metrics[10]}" +} + +# AutoRegressive Decoding +for budget_seq in ${budget_seqs[@]}; do + for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_ar[@]}"; do + temperature="0" # AR decoding always uses temperature 0 + log_file="logs/AR_${target_model}_${draft_model}_${dataset}_${request_rate}_${draft_size}_${budget_seq}.log" + mkdir -p $(dirname "$log_file") + + # Check if configuration already exists + if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then + echo "Configuration already exists in CSV. Skipping." + continue + fi + + # Run the benchmark + python3 baseline_ar.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --budget-token "$budget_token" \ + --budget-seq "$budget_seq" \ + --request-rate "$request_rate" > "$log_file" 2>&1 + + # Extract values from the log file + if ! extract_values "$log_file"; then + echo "Failed to extract values. Skipping." 
+ continue + fi + + # Append results to CSV + echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" + done + done + done + done +done + +# Speculative Decoding +for budget_seq in ${budget_seqs[@]}; do + for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for temperature in "${temperatures[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_speculative[@]}"; do + log_file="logs/speculative_${target_model}_${draft_model}_${dataset}_${temperature}_${request_rate}_${draft_size}_${budget_seq}.log" + mkdir -p $(dirname "$log_file") + + # Check if configuration already exists + if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then + echo "Configuration already exists in CSV. Skipping." + continue + fi + + # Run the benchmark + python3 baseline.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --budget-seq "$budget_seq" \ + --budget-token "$budget_token" \ + --temperature "$temperature" \ + --request-rate "$request_rate" > "$log_file" 2>&1 + + # Extract values from the log file + if ! extract_values "$log_file"; then + echo "Failed to extract values. Skipping." + continue + fi + + # Append results to CSV + echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" + done + done + done + done + done +done + +echo "All benchmarks completed." diff --git a/baseline/run_all_A100.sh b/baseline/run_all_A100.sh new file mode 100755 index 0000000000000..39029c09b1023 --- /dev/null +++ b/baseline/run_all_A100.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +set -e +set -o pipefail + +export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps + +# Get GPU name using Python script +gpu_name=$(python3 get_gpu_name.py) + +# Cleanup the GPU name +gpu_name=$(echo "$gpu_name" | tr -d '[:space:]') + +echo "GPU: $gpu_name" + +# Model pairs to benchmark +declare -a models=( + # "huggyllama/llama-7b,JackFram/llama-68m" + "facebook/opt-6.7b,facebook/opt-125m" + # "EleutherAI/pythia-6.9b,EleutherAI/pythia-160m" +) + +# Common arguments +datasets=("finance") +temperatures=(0) +request_rates=(16) +draft_sizes_ar=(0) +draft_sizes_speculative=(1 3 5 7) +budget_seqs=(128) +prefill_schedule_mode="full_prefill" +colocate="False" +consolidated_attention="False" +drop_threshold="0" +budget_token="4096" + +# Output CSV file +output_csv="baseline_pythia_sharegpt_$gpu_name.csv" + +# Initialize CSV file +initialize_csv() { + if [ ! 
-f "$output_csv" ]; then + echo "Initializing CSV file: $output_csv" + echo "gpu_name,target_model,draft_model,dataset,temperature,request_rate,draft_size,prefill_schedule_mode,budget_token,budget_seq,colocate,consolidated_attention,drop_threshold,p50_ttft,p99_ttft,p50_tpot,p99_tpot,p50_token_latency,p99_token_latency,token_throughput,request_throughput,token_latency,preempt_flag" > "$output_csv" + fi +} + +# Initialize the CSV file +initialize_csv + +# Function to check if configuration exists in CSV +configuration_exists_in_csv() { + local gpu_name="$1" + local target_model="$2" + local draft_model="$3" + local dataset="$4" + local temperature="$5" + local request_rate="$6" + local draft_size="$7" + local budget_seq="$8" # Add budget_seq parameter + + # Skip header line (NR > 1) + if awk -v OFS=',' -F', *' -v gpu_name="$gpu_name" \ + -v target_model="$target_model" \ + -v draft_model="$draft_model" \ + -v dataset="$dataset" \ + -v temperature="$temperature" \ + -v request_rate="$request_rate" \ + -v draft_size="$draft_size" \ + -v budget_seq="$budget_seq" ' # Include budget_seq in the check + NR > 1 { + for (i=1; i<=NF; i++) { gsub(/^ +| +$/, "", $i) } + if ($1 == gpu_name && $2 == target_model && $3 == draft_model && $4 == dataset && $5 == temperature && $6 == request_rate && $7 == draft_size && $10 == budget_seq) { + found = 1; exit + } + } + END { exit !found } + ' "$output_csv"; then + return 0 # Configuration exists + else + return 1 # Configuration does not exist + fi +} + + +# Function to extract values from the benchmark output +extract_values() { + local log_file="$1" + local result_line=$(grep 'Result' "$log_file") + if [ -z "$result_line" ]; then + echo "Error: No 'result' line found in output." + return 1 + fi + IFS=', ' read -ra metrics <<< "$result_line" + request_throughput="${metrics[1]}" + token_throughput="${metrics[2]}" + token_latency="${metrics[3]}" + p50_ttft="${metrics[4]}" + p99_ttft="${metrics[5]}" + p50_tpot="${metrics[6]}" + p99_tpot="${metrics[7]}" + p50_token_latency="${metrics[8]}" + p99_token_latency="${metrics[9]}" + preempt_flag="${metrics[10]}" +} + +# AutoRegressive Decoding +for budget_seq in ${budget_seqs[@]}; do + for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_ar[@]}"; do + temperature="0" # AR decoding always uses temperature 0 + log_file="logs/AR_${target_model}_${draft_model}_${dataset}_${request_rate}_${draft_size}_${budget_seq}.log" + mkdir -p $(dirname "$log_file") + + # Check if configuration already exists + if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then + echo "Configuration already exists in CSV. Skipping." + continue + fi + + # Run the benchmark + python3 baseline_ar.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --budget-token "$budget_token" \ + --budget-seq "$budget_seq" \ + --request-rate "$request_rate" > "$log_file" 2>&1 + + # Extract values from the log file + if ! extract_values "$log_file"; then + echo "Failed to extract values. Skipping." 
+ continue + fi + + # Append results to CSV + echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" + done + done + done + done +done + +# Speculative Decoding +# for budget_seq in ${budget_seqs[@]}; do +# for model_pair in "${models[@]}"; do +# IFS=',' read -r target_model draft_model <<< "$model_pair" +# for dataset in "${datasets[@]}"; do +# for temperature in "${temperatures[@]}"; do +# for request_rate in "${request_rates[@]}"; do +# for draft_size in "${draft_sizes_speculative[@]}"; do +# log_file="logs/speculative_${target_model}_${draft_model}_${dataset}_${temperature}_${request_rate}_${draft_size}_${budget_seq}.log" +# mkdir -p $(dirname "$log_file") + +# # Check if configuration already exists +# if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then +# echo "Configuration already exists in CSV. Skipping." +# continue +# fi + +# # Run the benchmark +# python3 baseline.py \ +# --dataset "$dataset" \ +# --target-model "$target_model" \ +# --draft-model "$draft_model" \ +# --draft-size "$draft_size" \ +# --budget-seq "$budget_seq" \ +# --budget-token "$budget_token" \ +# --temperature "$temperature" \ +# --request-rate "$request_rate" > "$log_file" 2>&1 + +# # Extract values from the log file +# if ! extract_values "$log_file"; then +# echo "Failed to extract values. Skipping." +# continue +# fi + +# # Append results to CSV +# echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" +# done +# done +# done +# done +# done +# done + +# echo "All benchmarks completed."
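Note: run_all.sh, run_all_A100.sh, and chunked_prefill.sh shell out to get_gpu_name.py, and the Python benchmarks import sample_requests from a local dataset module; neither file is part of this patch. A minimal sketch of the assumed get_gpu_name.py helper, mirroring the torch.cuda calls already used in baseline.py:

# get_gpu_name.py (assumed helper, not included in this patch)
import torch

if torch.cuda.is_available():
    # Print the name of the first GPU; the run scripts strip the whitespace afterwards.
    print(torch.cuda.get_device_name(0))
else:
    print("NoCUDA")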