From e9e415bb6f45c118b92b3314cf9bb53eb2443231 Mon Sep 17 00:00:00 2001 From: Sangjin Choi Date: Thu, 17 Oct 2024 12:33:57 +0900 Subject: [PATCH] baseline scripts --- baseline/baseline.py | 245 +++++++++++++++++ baseline/baseline_adaptive.py | 348 ++++++++++++++++++++++++ baseline/baseline_adaptive.sh | 1 + baseline/baseline_ar.py | 250 +++++++++++++++++ baseline/baseline_ar_chunked_prefill.py | 238 ++++++++++++++++ baseline/chunked_prefill.sh | 130 +++++++++ baseline/run_all.sh | 190 +++++++++++++ baseline/run_all_A100.sh | 190 +++++++++++++ 8 files changed, 1592 insertions(+) create mode 100644 baseline/baseline.py create mode 100644 baseline/baseline_adaptive.py create mode 100755 baseline/baseline_adaptive.sh create mode 100644 baseline/baseline_ar.py create mode 100644 baseline/baseline_ar_chunked_prefill.py create mode 100755 baseline/chunked_prefill.sh create mode 100755 baseline/run_all.sh create mode 100755 baseline/run_all_A100.sh diff --git a/baseline/baseline.py b/baseline/baseline.py new file mode 100644 index 0000000000000..35eb9574f01a9 --- /dev/null +++ b/baseline/baseline.py @@ -0,0 +1,245 @@ +"""Benchmark offline inference throughput.""" + +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple +import json + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +import torch +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' +BENCHMARK_DURATION_IN_MINUTES = 5 + +# Disable garbage collection for performance +gc.disable() + +if torch.cuda.is_available(): + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + print(gpu_name) +else: + print("No CUDA device available") + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """Generates requests with associated times based on a Poisson process.""" + requests_with_time = [] + current_time = 0.0 + + for request in cycle(input_requests): + requests_with_time.append((current_time, request)) + interval = np.random.exponential(1.0 / request_rate) + current_time += interval + + # Add 1 minute to the benchmark duration for safety + if current_time > (BENCHMARK_DURATION_IN_MINUTES + 1) * 60: + break + + return requests_with_time + + +def run(llm: LLM, requests: List[Tuple[str, int, int]], request_rate: float, temperature: float) -> Tuple[dict, int, bool]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, request_rate) + outputs: List[RequestOutput] = [] + result = {} + + start_time = time.perf_counter() + + request_index = 0 + while time.perf_counter() - start_time < BENCHMARK_DURATION_IN_MINUTES * 60: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + 
inputs=prompt, params=sampling_params) + result[str(request_id)] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].append(ttft) + + if output.finished: + e2e_latency = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + + throughput = len(outputs) / (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} reqs/s") + + # remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Drop Threshold", args.drop_threshold], + ["Target Attention", args.target_attention], + ["Dataset", args.dataset], + ["Request Rate", args.request_rate], + ] + print(tabulate(config_table)) + llm = LLM( + model=args.target_model, + speculative_model=args.draft_model, + num_speculative_tokens=args.draft_size, + use_v2_block_manager=True, + gpu_memory_utilization=0.85, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens = run( + llm, requests, args.request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + #remove spaces in gpu_name + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + gpu_name = gpu_name.replace(" ", "") + + # print("GPU Name,Target Model, 
Draft Model, Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token), Disable by Batch Size") + # print(f"Result,{gpu_name},{args.target_model},{args.draft_model},{args.dataset},{args.temperature},{args.request_rate},{args.draft_size},{request_throughput},{token_throughput},{token_latency},{p50_ttft},{p99_ttft},{p50_tpot},{p99_tpot},{p50_token_latency},{p99_token_latency},False") + print(f"Result, {request_throughput:.3f}, {token_throughput:.3f}, {token_latency:.6f}, {p50_ttft:.6f}, {p99_ttft:.6f}, {p50_tpot:.6f}, {p99_tpot:.6f}, {p50_token_latency:.6f}, {p99_token_latency:.6f}, False") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default='sharegpt', + help="Path to the dataset.") + parser.add_argument("--input-len", type=int, default=None, + help="Input prompt length for each request.") + parser.add_argument("--output-len", type=int, default=None, + help="Output length for each request. Overrides the output length from the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=4) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature.") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'prioritize_prefill', 'full_prefill', 'chunked_prefill', 'chunked_prefill_demote_draft'], default='full_prefill') + parser.add_argument("--target-attention", + action="store_true", help="Use target attention.") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=64, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument("--request-rate", type=float, + default=4, 
help="Number of requests per second.") + parser.add_argument("--speculative-disable-by-batch-size", type=int, default=4) + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + if args.dataset is None or args.dataset == "dummy": + args.dataset = "dummy" + + main(args) \ No newline at end of file diff --git a/baseline/baseline_adaptive.py b/baseline/baseline_adaptive.py new file mode 100644 index 0000000000000..eb24f9911a877 --- /dev/null +++ b/baseline/baseline_adaptive.py @@ -0,0 +1,348 @@ +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' + +# Disable garbage collection for performance +gc.disable() + +# Global variables for benchmark duration +BENCHMARK_DURATION_IN_MINUTES = 3 + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + high_request_rate: float, + mid_request_rate: float, + low_request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """ + Generates requests with associated times based on a custom request rate pattern: + Long Low Rate -> Mid Rate -> Short High Rate -> Mid Rate -> Long Low Rate + """ + requests_with_time = [] + current_time = 0.0 + total_duration = BENCHMARK_DURATION_IN_MINUTES * 60 # Total benchmark duration in seconds + + # Define the durations for each phase + phase_durations = [ + total_duration / 5, # Phase 1: Low Rate + total_duration / 5, # Phase 2: Mid Rate + total_duration / 5, # Phase 3: High Rate + total_duration / 5, # Phase 4: Mid Rate + total_duration / 5 # Phase 5: Low Rate + ] + + # Define the phases with their corresponding request rates + phases = [ + (low_request_rate, phase_durations[0]), # Phase 1: Low Rate + (mid_request_rate, phase_durations[1]), # Phase 2: Mid Rate + (high_request_rate, phase_durations[2]), # Phase 3: High Rate + (mid_request_rate, phase_durations[3]), # Phase 4: Mid Rate + (low_request_rate, phase_durations[4]), # Phase 5: Low Rate + ] + + phase_index = 0 + current_request_rate, phase_duration = phases[phase_index] + time_period_end = phase_duration + + for request in cycle(input_requests): + # Generate inter-arrival time based on current request rate + if current_request_rate > 0: + interval = np.random.exponential(1.0 / current_request_rate) + else: + interval = float('inf') + current_time += interval + + # Update request rate based on time + if current_time > time_period_end and phase_index < len(phases) - 1: + phase_index += 1 + current_request_rate, phase_duration = phases[phase_index] + time_period_end += phase_duration # Accumulate the durations for time thresholds + + if current_time > total_duration: + break + + requests_with_time.append((current_time, request)) + + return requests_with_time + +def warmup(llm): + sampling_params = SamplingParams( + n=1, + temperature=0, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=128, + ) + dummy_prompt_token_ids = [[0] * 32] * 8 + start_time = time.perf_counter() + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time 
+ return latency + +def run(llm: LLM, requests: List[Tuple[str, int, int]], + high_request_rate: float, mid_request_rate: float, low_request_rate: float, + temperature: float) -> Tuple[dict, int, bool, List[Tuple[float, float]], List[Tuple[float, float]]]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, high_request_rate, mid_request_rate, low_request_rate) + outputs: List[RequestOutput] = [] + result = {} + token_latencies_over_time = [] + token_throughput_over_time = [] # New: To store token throughput over time + requests_over_time = [] + + interval_duration = 10.0 # Interval duration in seconds + next_interval_time = interval_duration # Time when the next interval ends + tokens_in_interval = 0 # Tokens processed in the current interval + + start_time = time.perf_counter() + + total_processed_tokens = 0 + request_index = 0 + while True: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while request_index < len(requests_with_time) and requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + inputs=prompt, params=sampling_params) + result[request_id] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].append(ttft) + + if output.finished: + current_time_in_run = time.perf_counter() - start_time + e2e_latency = current_time_in_run - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + num_tokens = len(output.prompt_token_ids) + len(output.outputs[0].token_ids) + total_processed_tokens += num_tokens + tokens_in_interval += num_tokens # Update tokens in the current interval + + # Calculate token latency + token_latency = e2e_latency / len(output.outputs[0].token_ids) + token_latencies_over_time.append((current_time_in_run, token_latency)) + + num_request = len(step_outputs) + requests_over_time.append((time.perf_counter() - start_time, num_request)) + + # Check if we've reached the end of the current interval + if current_time >= next_interval_time: + interval_throughput = tokens_in_interval / interval_duration + token_throughput_over_time.append((next_interval_time, interval_throughput)) + tokens_in_interval = 0 # Reset token count for next interval + next_interval_time += interval_duration + print(f"Token Throughput at {next_interval_time}s: {interval_throughput:.3f} tokens/s") + + throughput = total_processed_tokens / \ + (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} tokens/s") + + if not llm.llm_engine.has_unfinished_requests() and (current_time > BENCHMARK_DURATION_IN_MINUTES * 60): + break + + # Remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = 
sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens, token_latencies_over_time, token_throughput_over_time, requests_over_time # Return the throughput data + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Selective Validation", args.selective_validation], + ["Drop Threshold", args.drop_threshold], + ["Consolidated Attention", args.consolidated_attention], + ["Dataset", args.dataset], + ["High Request Rate", args.high_request_rate], + ["Mid Request Rate", args.mid_request_rate], + ["Low Request Rate", args.low_request_rate], + ["Benchmark Duration (min)", args.benchmark_duration], + ] + print(tabulate(config_table)) + + global BENCHMARK_DURATION_IN_MINUTES + BENCHMARK_DURATION_IN_MINUTES = args.benchmark_duration + + llm = LLM( + model=args.target_model, + gpu_memory_utilization=0.85, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + + warmup(llm) + + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens, token_latencies_over_time, token_throughput_over_time, requests_over_time = run( + llm, requests, args.high_request_rate, args.mid_request_rate, args.low_request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + # Print all results in csv format + # print("Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token)," + # "P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token)," + # "Preempt Flag") + # print(f"result, {request_throughput:.3f}, {token_throughput:.3f}, {token_latency:.6f}, {p50_ttft:.6f}, {p99_ttft:.6f}, {p50_tpot:.6f}, {p99_tpot:.6f}, {p50_token_latency:.6f}, {p99_token_latency:.6f}, {preempt_flag}") + + file_name_prefix = f"AR" + + # Write token latencies over time to a CSV file + with open(f'token_latencies_over_time_{file_name_prefix}.csv', 'w') as f: + 
f.write('Time(s),Token_Latency(s/token)\n') + for time_point, latency in token_latencies_over_time: + f.write(f"{time_point},{latency}\n") + + # Write token throughput over time to a CSV file + with open(f'token_throughput_over_time_{file_name_prefix}.csv', 'w') as f: + f.write('Time(s),Token_Throughput(tokens/s)\n') + for time_point, throughput in token_throughput_over_time: + f.write(f"{time_point},{throughput}\n") + + with open(f'requests_over_time_{file_name_prefix}.csv', 'w') as f: + f.write('Time(s),Num_Requests\n') + for time_point, num_requests in requests_over_time: + f.write(f"{time_point},{num_requests}\n") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default=None, + help="Path to the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=0) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature in [0, 0.25, 0.5, 0.75].") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'full_prefill', 'chunked_prefill'], default='full_prefill') + parser.add_argument("--consolidated-attention", + action="store_true", help="Use consolidated attention.") + parser.add_argument("--selective-validation", action="store_true") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=128, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument('--benchmark-duration', type=int, default=5, + help='Benchmark duration in minutes.') + parser.add_argument('--low-request-rate', type=float, default=6, + help='Low request rate for changing arrival pattern.') + parser.add_argument('--mid-request-rate', type=float, default=12, + help='Mid request rate for changing arrival pattern.') + parser.add_argument('--high-request-rate', type=float, default=24, + help='High 
request rate for changing arrival pattern.') + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + + assert args.dataset is not None + + main(args) diff --git a/baseline/baseline_adaptive.sh b/baseline/baseline_adaptive.sh new file mode 100755 index 0000000000000..f7602bbea1711 --- /dev/null +++ b/baseline/baseline_adaptive.sh @@ -0,0 +1 @@ +python baseline_adaptive.py --dataset gsm8k --temperature 0.5 --prefill-schedule-mode full_prefill --budget-token 4096 --budget-seq 256 --target-model facebook/opt-6.7b --draft-model facebook/opt-125m diff --git a/baseline/baseline_ar.py b/baseline/baseline_ar.py new file mode 100644 index 0000000000000..2bbf7852ab3ec --- /dev/null +++ b/baseline/baseline_ar.py @@ -0,0 +1,250 @@ +"""Benchmark offline inference throughput.""" + +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple +import json + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +import torch +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' +BENCHMARK_DURATION_IN_MINUTES = 5 + +# Disable garbage collection for performance +gc.disable() + +if torch.cuda.is_available(): + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + print(gpu_name) +else: + print("No CUDA device available") + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """Generates requests with associated times based on a Poisson process.""" + requests_with_time = [] + current_time = 0.0 + + for request in cycle(input_requests): + requests_with_time.append((current_time, request)) + interval = np.random.exponential(1.0 / request_rate) + current_time += interval + + # Add 1 minute to the benchmark duration for safety + if current_time > (BENCHMARK_DURATION_IN_MINUTES + 1) * 60: + break + + return requests_with_time + + +def run(llm: LLM, requests: List[Tuple[str, int, int]], request_rate: float, temperature: float) -> Tuple[dict, int, bool]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, request_rate) + outputs: List[RequestOutput] = [] + result = {} + + start_time = time.perf_counter() + + request_index = 0 + while time.perf_counter() - start_time < BENCHMARK_DURATION_IN_MINUTES * 60: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + inputs=prompt, params=sampling_params) + result[str(request_id)] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + 
result[output.request_id].append(ttft) + + if output.finished: + e2e_latency = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + + throughput = len(outputs) / (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} reqs/s") + + # remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens + + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Drop Threshold", args.drop_threshold], + ["Target Attention", args.target_attention], + ["Dataset", args.dataset], + ["Request Rate", args.request_rate], + ] + print(tabulate(config_table)) + llm = LLM( + model=args.target_model, + gpu_memory_utilization=0.85, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens = run( + llm, requests, args.request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + #remove spaces in gpu_name + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + gpu_name = gpu_name.replace(" ", "") + + # print("GPU Name,Target Model, Draft Model, Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token), Disable by Batch Size") + # 
print(f"Result,{gpu_name},{args.target_model},{args.draft_model},{args.dataset},{args.temperature},{args.request_rate},{args.draft_size},{request_throughput},{token_throughput},{token_latency},{p50_ttft},{p99_ttft},{p50_tpot},{p99_tpot},{p50_token_latency},{p99_token_latency},False") + + + # # Print all results in csv format + # print("Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token)," + # "P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token)," + # "Preempt Flag") + print(f"Result, {request_throughput:.3f}, {token_throughput:.3f}, {token_latency:.6f}, {p50_ttft:.6f}, {p99_ttft:.6f}, {p50_tpot:.6f}, {p99_tpot:.6f}, {p50_token_latency:.6f}, {p99_token_latency:.6f}, False") + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default='sharegpt', + help="Path to the dataset.") + parser.add_argument("--input-len", type=int, default=None, + help="Input prompt length for each request.") + parser.add_argument("--output-len", type=int, default=None, + help="Output length for each request. Overrides the output length from the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=4) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature.") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'prioritize_prefill', 'full_prefill', 'chunked_prefill', 'chunked_prefill_demote_draft'], default='full_prefill') + parser.add_argument("--target-attention", + action="store_true", help="Use target attention.") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=64, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument("--request-rate", type=float, + default=4, 
help="Number of requests per second.") + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + if args.dataset is None or args.dataset == "dummy": + args.dataset = "dummy" + + main(args) \ No newline at end of file diff --git a/baseline/baseline_ar_chunked_prefill.py b/baseline/baseline_ar_chunked_prefill.py new file mode 100644 index 0000000000000..34c14ab381f0d --- /dev/null +++ b/baseline/baseline_ar_chunked_prefill.py @@ -0,0 +1,238 @@ +"""Benchmark offline inference throughput.""" + +import argparse +import random +import time +import gc +from itertools import cycle +from typing import List, Optional, Tuple +import json + +import numpy as np +from tabulate import tabulate +from transformers import AutoTokenizer +import torch +from dataset import sample_requests + +from vllm import LLM, SamplingParams +from vllm.outputs import RequestOutput +from transformers import PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer import get_tokenizer + +# Constants +DOWNLOAD_DIR = '/mnt/sda/download' +BENCHMARK_DURATION_IN_MINUTES = 5 + +# Disable garbage collection for performance +gc.disable() + +if torch.cuda.is_available(): + gpu_index = 0 # First GPU + gpu_name = torch.cuda.get_device_name(gpu_index) + print(gpu_name) +else: + print("No CUDA device available") + +def get_requests_with_time(input_requests: List[Tuple[str, int, int]], + request_rate: float) -> List[Tuple[float, Tuple[str, int, int]]]: + """Generates requests with associated times based on a Poisson process.""" + requests_with_time = [] + current_time = 0.0 + + for request in cycle(input_requests): + requests_with_time.append((current_time, request)) + interval = np.random.exponential(1.0 / request_rate) + current_time += interval + + # Add 1 minute to the benchmark duration for safety + if current_time > (BENCHMARK_DURATION_IN_MINUTES + 1) * 60: + break + + return requests_with_time + + +def run(llm: LLM, requests: List[Tuple[str, int, int]], request_rate: float, temperature: float) -> Tuple[dict, int, bool]: + """Runs the benchmark, processing requests with the given LLM.""" + requests_with_time = get_requests_with_time(requests, request_rate) + outputs: List[RequestOutput] = [] + result = {} + + start_time = time.perf_counter() + + request_index = 0 + while time.perf_counter() - start_time < BENCHMARK_DURATION_IN_MINUTES * 60: + current_time = time.perf_counter() - start_time + + # Add requests to the engine if their scheduled time has passed + while requests_with_time[request_index][0] <= current_time: + request_start_time, (prompt, prompt_len, + output_len) = requests_with_time[request_index] + sampling_params = SamplingParams( + n=1, + temperature=random.choice( + [0, 0.25, 0.5, 0.75]) if temperature == -1 else temperature, + top_p=1.0, + use_beam_search=False, + ignore_eos=True, + max_tokens=output_len, + ) + request_id = llm._add_request( + inputs=prompt, params=sampling_params) + result[str(request_id)] = [request_start_time] + request_index += 1 + + step_outputs = llm.llm_engine.step() + for output in step_outputs: + if len(output.outputs[0].token_ids) == 1 and len(result[output.request_id]) == 1: + ttft = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].append(ttft) + + if output.finished: + e2e_latency = time.perf_counter() - start_time - \ + result[output.request_id][0] + result[output.request_id].extend( + [e2e_latency, len(output.prompt_token_ids), len(output.outputs[0].token_ids)]) + outputs.append(output) + + 
throughput = len(outputs) / (time.perf_counter() - start_time) + print(f"Throughput: {throughput:.3f} reqs/s") + + # remove request_id from result if not exist in outputs + for request_id in list(result.keys()): + if request_id not in [output.request_id for output in outputs]: + del result[request_id] + + total_tokens = sum(prompt_len + output_len for _, _, _, + prompt_len, output_len in result.values()) + + return result, total_tokens + + +def analyze_results(result: dict) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Analyzes the results to compute TTFT, TPOT, and token latencies.""" + ttfts, tpots, token_latencies = [], [], [] + + for _, values in result.items(): + _, ttft, e2e_latency, _, output_len = values + ttfts.append(ttft) + tpots.append((e2e_latency - ttft) / (output_len - 1)) + token_latencies.append(e2e_latency / output_len) + + return np.array(ttfts), np.array(tpots), np.array(token_latencies) + + +def main(args: argparse.Namespace): + random.seed(args.seed) + + # Display configuration tables + config_table = [ + ["Target Model", args.target_model], + ["Draft Model", args.draft_model], + ["Draft Size", args.draft_size], + ["Temperature", args.temperature], + ["Colocate", args.colocate], + ["Prefill Schedule Mode", args.prefill_schedule_mode], + ["Budget Token", args.budget_token], + ["Budget Seq", args.budget_seq], + ["Drop Threshold", args.drop_threshold], + ["Target Attention", args.target_attention], + ["Dataset", args.dataset], + ["Request Rate", args.request_rate], + ] + print(tabulate(config_table)) + llm = LLM( + model=args.target_model, + gpu_memory_utilization=0.85, + enable_chunked_prefill=True, + max_num_seqs=args.budget_seq, + ) + + # Sample the requests + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + # 0 indicates all requests + requests = sample_requests(args.dataset, 0, tokenizer) + + # Run the benchmark + start_time = time.perf_counter() + result, total_tokens = run( + llm, requests, args.request_rate, args.temperature) + elapsed_time = time.perf_counter() - start_time + + # Analyze results + ttfts, tpots, token_latencies = analyze_results(result) + + # Main results + request_throughput = len(result) / elapsed_time + token_throughput = total_tokens / elapsed_time + # token_latency: the average processing time per output token + token_latency = np.mean(token_latencies) + + # Sub results + p50_ttft = np.percentile(ttfts, 50) + p99_ttft = np.percentile(ttfts, 99) + p50_tpot = np.percentile(tpots, 50) + p99_tpot = np.percentile(tpots, 99) + p50_token_latency = np.percentile(token_latencies, 50) + p99_token_latency = np.percentile(token_latencies, 99) + + print("GPU Name,Target Model,Draft Model,Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token), Disable by Batch Size") + print(f"Result,{gpu_name},{args.target_model},{args.draft_model},{args.dataset},{args.temperature},{args.request_rate},{args.draft_size},{request_throughput},{token_throughput},{token_latency},{p50_ttft},{p99_ttft},{p50_tpot},{p99_tpot},{p50_token_latency},{p99_token_latency}, False") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser.add_argument("--dataset", type=str, default='sharegpt', + help="Path to the dataset.") + parser.add_argument("--input-len", type=int, default=None, + 
help="Input prompt length for each request.") + parser.add_argument("--output-len", type=int, default=None, + help="Output length for each request. Overrides the output length from the dataset.") + parser.add_argument('--target-model', type=str, + default='facebook/opt-6.7b') + parser.add_argument('--draft-model', type=str, default='facebook/opt-125m') + parser.add_argument('--draft-size', type=int, default=4) + parser.add_argument('--temperature', type=float, default=0.0, + help="Temperature for sampling. -1 for random temperature.") + parser.add_argument('--colocate', '-c', action='store_true') + parser.add_argument('--prefill-schedule-mode', '-psm', choices=[ + 'prioritize_prefill', 'full_prefill', 'chunked_prefill', 'chunked_prefill_demote_draft'], default='full_prefill') + parser.add_argument("--target-attention", + action="store_true", help="Use target attention.") + parser.add_argument("--drop-threshold", '-dt', type=float, + default=0, help="Threshold for dropping token.") + parser.add_argument('--budget-token', type=int, default=2048, + help='Maximum number of tokens for each batch.') + parser.add_argument('--budget-seq', type=int, default=256, + help='Maximum number of sequences for each request.') + parser.add_argument("--tokenizer", type=str, default=None) + parser.add_argument('--quantization', '-q', + choices=['awq', 'gptq', 'squeezellm', None], default=None) + parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) + parser.add_argument("--n", type=int, default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument("--num-prompts", type=int, default=1000, + help="Number of prompts to process.") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--hf-max-batch-size", type=int, + default=None, help="Maximum batch size for HF backend.") + parser.add_argument('--trust-remote-code', action='store_true', + help='Trust remote code from Hugging Face.') + parser.add_argument('--max-model-len', type=int, default=None, + help='Maximum length of a sequence (including prompt and output).') + parser.add_argument('--dtype', type=str, default='auto', choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], help='Data type for model weights and activations.') + parser.add_argument("--enforce-eager", action="store_true", + help="Enforce eager execution.") + parser.add_argument("--request-rate", type=float, + default=4, help="Number of requests per second.") + + args = parser.parse_args() + + if args.tokenizer is None: + args.tokenizer = args.target_model + if args.dataset is None or args.dataset == "dummy": + args.dataset = "dummy" + + main(args) \ No newline at end of file diff --git a/baseline/chunked_prefill.sh b/baseline/chunked_prefill.sh new file mode 100755 index 0000000000000..1b5c5b050a70a --- /dev/null +++ b/baseline/chunked_prefill.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# Get GPU name using Python script +gpu_name=$(python3 get_gpu_name.py) + +# Cleanup the GPU name +gpu_name=$(echo $gpu_name | tr -d '[:space:]') + +echo "GPU: $gpu_name" + +# Model pairs to benchmark +declare -a models=( + # Uncomment the models you want to benchmark + # "facebook/opt-13b,facebook/opt-125m" + # "facebook/opt-6.7b,facebook/opt-125m" + "EleutherAI/pythia-6.9b,EleutherAI/pythia-160m" + # Add more model pairs as needed +) + +# Common arguments +# Define the request rates, draft sizes, and temperatures +#request_rates=(2 4 6 8 10 12 14 16 18 20) +request_rates=(6) 
+draft_sizes_ar=(0) +draft_sizes_speculative=(1 3 5 7) +temperatures=(0 0.25 0.5 0.75 -1) + +# Define other default arguments (adjust as necessary) +datasets=("sharegpt") + +# Output CSV file +output_csv="chunked_prefill_A6000.csv" + +# Initialize the CSV file with the header if it doesn't exist +if [ ! -f "$output_csv" ]; then + echo "Result,GPU Name,Target Model,Draft Model,Dataset,Temperature,Request Rate,Draft Size,Request Throughput (reqs/s),Token Throughput (tokens/s),Token Latency (s/token),P50 TTFT (s),P99 TTFT (s),P50 TPOT (s/token),P99 TPOT (s/token),P50 Token Latency (s/token),P99 Token Latency (s/token),Preempt Flag" > "$output_csv" +fi + +# Calculate total number of iterations for progress tracking +total_iterations=$(( ${#models[@]} * ${#datasets[@]} * ${#request_rates[@]} * ( ${#draft_sizes_ar[@]} ) )) +iteration=0 + +# AutoRegressive Decoding +# Loop through each model pair, dataset, request rate, and draft size combination +for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_ar[@]}"; do + iteration=$((iteration + 1)) + ./slack "Progress: $iteration/$total_iterations" + echo "Running AR benchmark with target model: $target_model, draft model: $draft_model, dataset: $dataset, request rate: $request_rate, draft size: $draft_size" + + # Run the benchmark script and append the output to the CSV file + python3 baseline_ar_chunked_prefill.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --request-rate "$request_rate" | grep "Result" >> "$output_csv" + + echo "Saved results for target model: $target_model, draft model: $draft_model, dataset: $dataset, request rate: $request_rate, draft size: $draft_size" + done + done + done +done + +echo "All AutoRegressive benchmarks completed." + +# # Speculative Decoding +# # Loop through each model pair, dataset, temperature, request rate, and draft size combination +# for model_pair in "${models[@]}"; do +# IFS=',' read -r target_model draft_model <<< "$model_pair" +# for dataset in "${datasets[@]}"; do +# for temperature in "${temperatures[@]}"; do +# for request_rate in "${request_rates[@]}"; do +# for draft_size in "${draft_sizes_speculative[@]}"; do +# iteration=$((iteration + 1)) +# ./slack "Progress: $iteration/$total_iterations" +# echo "Running Speculative benchmark with target model: $target_model, draft model: $draft_model, dataset: $dataset, temperature: $temperature, request rate: $request_rate, draft size: $draft_size" + +# # Run the benchmark script and append the output to the CSV file +# python3 baseline.py \ +# --dataset "$dataset" \ +# --target-model "$target_model" \ +# --draft-model "$draft_model" \ +# --draft-size "$draft_size" \ +# --temperature "$temperature" \ +# --request-rate "$request_rate" | grep "Result" >> "$output_csv" + +# echo "Saved results for target model: $target_model, draft model: $draft_model, dataset: $dataset, temperature: $temperature, request rate: $request_rate, draft size: $draft_size" +# done +# done +# done +# done +# done + +# echo "All Speculative Decoding benchmarks completed." 
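+# Example (not executed by this script): a single speculative configuration can
+# be benchmarked directly with baseline.py, e.g.
+#   python3 baseline.py --dataset sharegpt \
+#     --target-model EleutherAI/pythia-6.9b --draft-model EleutherAI/pythia-160m \
+#     --draft-size 5 --temperature 0 --request-rate 6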
+ +# # Disable Speculative Decoding by Batch Size +# # Define disable size array +# disable_size=(32 64 128) + +# # Loop through each model pair, request rate, draft size, and disable size combination +# for model_pair in "${models[@]}"; do +# IFS=',' read -r target_model draft_model <<< "$model_pair" +# for request_rate in "${request_rates[@]}"; do +# for draft_size in "${draft_sizes_speculative[@]}"; do +# for disable in "${disable_size[@]}"; do +# iteration=$((iteration + 1)) +# ./slack "Progress: $iteration/$total_iterations" +# echo "Running benchmark with target model: $target_model, draft model: $draft_model, request rate: $request_rate, draft size: $draft_size, disable size: $disable" + +# # Run the benchmark script and append the output to the CSV file +# python3 baseline_specdis.py \ +# --dataset "$dataset" \ +# --target-model "$target_model" \ +# --draft-model "$draft_model" \ +# --draft-size "$draft_size" \ +# --temperature "$temperature" \ +# --speculative-disable-by-batch-size "$disable" \ +# --request-rate "$request_rate" | grep "Result" >> "$output_csv" + +# echo "Saved results for target model: $target_model, draft model: $draft_model, request rate: $request_rate, draft size: $draft_size, disable size: $disable" +# done +# done +# done +# done + +# echo "All benchmarks completed." diff --git a/baseline/run_all.sh b/baseline/run_all.sh new file mode 100755 index 0000000000000..7d80389580470 --- /dev/null +++ b/baseline/run_all.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +set -e +set -o pipefail + +export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps + +# Get GPU name using Python script +gpu_name=$(python3 get_gpu_name.py) + +# Cleanup the GPU name +gpu_name=$(echo "$gpu_name" | tr -d '[:space:]') + +echo "GPU: $gpu_name" + +# Model pairs to benchmark +declare -a models=( + # "huggyllama/llama-7b,JackFram/llama-68m" + # "facebook/opt-6.7b,facebook/opt-125m" + "EleutherAI/pythia-6.9b,EleutherAI/pythia-160m" +) + +# Common arguments +datasets=("finance") +temperatures=(0.75) +request_rates=(8) +draft_sizes_ar=(0) +draft_sizes_speculative=(1 3 5 7) +budget_seqs=(64) +prefill_schedule_mode="full_prefill" +colocate="False" +consolidated_attention="False" +drop_threshold="0" +budget_token="4096" + +# Output CSV file +output_csv="baseline_pythia_finance_$gpu_name.csv" + +# Initialize CSV file +initialize_csv() { + if [ ! 
-f "$output_csv" ]; then + echo "Initializing CSV file: $output_csv" + echo "gpu_name,target_model,draft_model,dataset,temperature,request_rate,draft_size,prefill_schedule_mode,budget_token,budget_seq,colocate,consolidated_attention,drop_threshold,p50_ttft,p99_ttft,p50_tpot,p99_tpot,p50_token_latency,p99_token_latency,token_throughput,request_throughput,token_latency,preempt_flag" > "$output_csv" + fi +} + +# Initialize the CSV file +initialize_csv + +# Function to check if configuration exists in CSV +configuration_exists_in_csv() { + local gpu_name="$1" + local target_model="$2" + local draft_model="$3" + local dataset="$4" + local temperature="$5" + local request_rate="$6" + local draft_size="$7" + local budget_seq="$8" # Add budget_seq parameter + + # Skip header line (NR > 1) + if awk -v OFS=',' -F', *' -v gpu_name="$gpu_name" \ + -v target_model="$target_model" \ + -v draft_model="$draft_model" \ + -v dataset="$dataset" \ + -v temperature="$temperature" \ + -v request_rate="$request_rate" \ + -v draft_size="$draft_size" \ + -v budget_seq="$budget_seq" ' # Include budget_seq in the check + NR > 1 { + for (i=1; i<=NF; i++) { gsub(/^ +| +$/, "", $i) } + if ($1 == gpu_name && $2 == target_model && $3 == draft_model && $4 == dataset && $5 == temperature && $6 == request_rate && $7 == draft_size && $10 == budget_seq) { + found = 1; exit + } + } + END { exit !found } + ' "$output_csv"; then + return 0 # Configuration exists + else + return 1 # Configuration does not exist + fi +} + + +# Function to extract values from the benchmark output +extract_values() { + local log_file="$1" + local result_line=$(grep 'Result' "$log_file") + if [ -z "$result_line" ]; then + echo "Error: No 'result' line found in output." + return 1 + fi + IFS=', ' read -ra metrics <<< "$result_line" + request_throughput="${metrics[1]}" + token_throughput="${metrics[2]}" + token_latency="${metrics[3]}" + p50_ttft="${metrics[4]}" + p99_ttft="${metrics[5]}" + p50_tpot="${metrics[6]}" + p99_tpot="${metrics[7]}" + p50_token_latency="${metrics[8]}" + p99_token_latency="${metrics[9]}" + preempt_flag="${metrics[10]}" +} + +# AutoRegressive Decoding +for budget_seq in ${budget_seqs[@]}; do + for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_ar[@]}"; do + temperature="0" # AR decoding always uses temperature 0 + log_file="logs/AR_${target_model}_${draft_model}_${dataset}_${request_rate}_${draft_size}_${budget_seq}.log" + mkdir -p $(dirname "$log_file") + + # Check if configuration already exists + if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then + echo "Configuration already exists in CSV. Skipping." + continue + fi + + # Run the benchmark + python3 baseline_ar.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --budget-token "$budget_token" \ + --budget-seq "$budget_seq" \ + --request-rate "$request_rate" > "$log_file" 2>&1 + + # Extract values from the log file + if ! extract_values "$log_file"; then + echo "Failed to extract values. Skipping." 
+ continue + fi + + # Append results to CSV + echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" + done + done + done + done +done + +# Speculative Decoding +for budget_seq in ${budget_seqs[@]}; do + for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for temperature in "${temperatures[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_speculative[@]}"; do + log_file="logs/speculative_${target_model}_${draft_model}_${dataset}_${temperature}_${request_rate}_${draft_size}_${budget_seq}.log" + mkdir -p $(dirname "$log_file") + + # Check if configuration already exists + if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then + echo "Configuration already exists in CSV. Skipping." + continue + fi + + # Run the benchmark + python3 baseline.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --budget-seq "$budget_seq" \ + --budget-token "$budget_token" \ + --temperature "$temperature" \ + --request-rate "$request_rate" > "$log_file" 2>&1 + + # Extract values from the log file + if ! extract_values "$log_file"; then + echo "Failed to extract values. Skipping." + continue + fi + + # Append results to CSV + echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" + done + done + done + done + done +done + +echo "All benchmarks completed." diff --git a/baseline/run_all_A100.sh b/baseline/run_all_A100.sh new file mode 100755 index 0000000000000..39029c09b1023 --- /dev/null +++ b/baseline/run_all_A100.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +set -e +set -o pipefail + +export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps + +# Get GPU name using Python script +gpu_name=$(python3 get_gpu_name.py) + +# Cleanup the GPU name +gpu_name=$(echo "$gpu_name" | tr -d '[:space:]') + +echo "GPU: $gpu_name" + +# Model pairs to benchmark +declare -a models=( + # "huggyllama/llama-7b,JackFram/llama-68m" + "facebook/opt-6.7b,facebook/opt-125m" + # "EleutherAI/pythia-6.9b,EleutherAI/pythia-160m" +) + +# Common arguments +datasets=("finance") +temperatures=(0) +request_rates=(16) +draft_sizes_ar=(0) +draft_sizes_speculative=(1 3 5 7) +budget_seqs=(128) +prefill_schedule_mode="full_prefill" +colocate="False" +consolidated_attention="False" +drop_threshold="0" +budget_token="4096" + +# Output CSV file +output_csv="baseline_pythia_sharegpt_$gpu_name.csv" + +# Initialize CSV file +initialize_csv() { + if [ ! 
-f "$output_csv" ]; then + echo "Initializing CSV file: $output_csv" + echo "gpu_name,target_model,draft_model,dataset,temperature,request_rate,draft_size,prefill_schedule_mode,budget_token,budget_seq,colocate,consolidated_attention,drop_threshold,p50_ttft,p99_ttft,p50_tpot,p99_tpot,p50_token_latency,p99_token_latency,token_throughput,request_throughput,token_latency,preempt_flag" > "$output_csv" + fi +} + +# Initialize the CSV file +initialize_csv + +# Function to check if configuration exists in CSV +configuration_exists_in_csv() { + local gpu_name="$1" + local target_model="$2" + local draft_model="$3" + local dataset="$4" + local temperature="$5" + local request_rate="$6" + local draft_size="$7" + local budget_seq="$8" # Add budget_seq parameter + + # Skip header line (NR > 1) + if awk -v OFS=',' -F', *' -v gpu_name="$gpu_name" \ + -v target_model="$target_model" \ + -v draft_model="$draft_model" \ + -v dataset="$dataset" \ + -v temperature="$temperature" \ + -v request_rate="$request_rate" \ + -v draft_size="$draft_size" \ + -v budget_seq="$budget_seq" ' # Include budget_seq in the check + NR > 1 { + for (i=1; i<=NF; i++) { gsub(/^ +| +$/, "", $i) } + if ($1 == gpu_name && $2 == target_model && $3 == draft_model && $4 == dataset && $5 == temperature && $6 == request_rate && $7 == draft_size && $10 == budget_seq) { + found = 1; exit + } + } + END { exit !found } + ' "$output_csv"; then + return 0 # Configuration exists + else + return 1 # Configuration does not exist + fi +} + + +# Function to extract values from the benchmark output +extract_values() { + local log_file="$1" + local result_line=$(grep 'Result' "$log_file") + if [ -z "$result_line" ]; then + echo "Error: No 'result' line found in output." + return 1 + fi + IFS=', ' read -ra metrics <<< "$result_line" + request_throughput="${metrics[1]}" + token_throughput="${metrics[2]}" + token_latency="${metrics[3]}" + p50_ttft="${metrics[4]}" + p99_ttft="${metrics[5]}" + p50_tpot="${metrics[6]}" + p99_tpot="${metrics[7]}" + p50_token_latency="${metrics[8]}" + p99_token_latency="${metrics[9]}" + preempt_flag="${metrics[10]}" +} + +# AutoRegressive Decoding +for budget_seq in ${budget_seqs[@]}; do + for model_pair in "${models[@]}"; do + IFS=',' read -r target_model draft_model <<< "$model_pair" + for dataset in "${datasets[@]}"; do + for request_rate in "${request_rates[@]}"; do + for draft_size in "${draft_sizes_ar[@]}"; do + temperature="0" # AR decoding always uses temperature 0 + log_file="logs/AR_${target_model}_${draft_model}_${dataset}_${request_rate}_${draft_size}_${budget_seq}.log" + mkdir -p $(dirname "$log_file") + + # Check if configuration already exists + if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then + echo "Configuration already exists in CSV. Skipping." + continue + fi + + # Run the benchmark + python3 baseline_ar.py \ + --dataset "$dataset" \ + --target-model "$target_model" \ + --draft-model "$draft_model" \ + --draft-size "$draft_size" \ + --budget-token "$budget_token" \ + --budget-seq "$budget_seq" \ + --request-rate "$request_rate" > "$log_file" 2>&1 + + # Extract values from the log file + if ! extract_values "$log_file"; then + echo "Failed to extract values. Skipping." 
+ continue + fi + + # Append results to CSV + echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" + done + done + done + done +done + +# Speculative Decoding +# for budget_seq in ${budget_seqs[@]}; do +# for model_pair in "${models[@]}"; do +# IFS=',' read -r target_model draft_model <<< "$model_pair" +# for dataset in "${datasets[@]}"; do +# for temperature in "${temperatures[@]}"; do +# for request_rate in "${request_rates[@]}"; do +# for draft_size in "${draft_sizes_speculative[@]}"; do +# log_file="logs/speculative_${target_model}_${draft_model}_${dataset}_${temperature}_${request_rate}_${draft_size}_${budget_seq}.log" +# mkdir -p $(dirname "$log_file") + +# # Check if configuration already exists +# if configuration_exists_in_csv "$gpu_name" "$target_model" "$draft_model" "$dataset" "$temperature" "$request_rate" "$draft_size" "$budget_seq"; then +# echo "Configuration already exists in CSV. Skipping." +# continue +# fi + +# # Run the benchmark +# python3 baseline.py \ +# --dataset "$dataset" \ +# --target-model "$target_model" \ +# --draft-model "$draft_model" \ +# --draft-size "$draft_size" \ +# --budget-seq "$budget_seq" \ +# --budget-token "$budget_token" \ +# --temperature "$temperature" \ +# --request-rate "$request_rate" > "$log_file" 2>&1 + +# # Extract values from the log file +# if ! extract_values "$log_file"; then +# echo "Failed to extract values. Skipping." +# continue +# fi + +# # Append results to CSV +# echo "$gpu_name,$target_model,$draft_model,$dataset,$temperature,$request_rate,$draft_size,$prefill_schedule_mode,$budget_token,$budget_seq,$colocate,$consolidated_attention,$drop_threshold,$p50_ttft,$p99_ttft,$p50_tpot,$p99_tpot,$p50_token_latency,$p99_token_latency,$token_throughput,$request_throughput,$token_latency,$preempt_flag" >> "$output_csv" +# done +# done +# done +# done +# done +# done + +# echo "All benchmarks completed."
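Note: run_all.sh, run_all_A100.sh, and chunked_prefill.sh shell out to get_gpu_name.py, and the Python benchmarks import sample_requests from a local dataset module; neither file is part of this patch. A minimal sketch of the assumed get_gpu_name.py helper, mirroring the torch.cuda calls already used in baseline.py:

# get_gpu_name.py (assumed helper, not included in this patch)
import torch

if torch.cuda.is_available():
    # Print the name of the first GPU; the run scripts strip the whitespace afterwards.
    print(torch.cuda.get_device_name(0))
else:
    print("NoCUDA")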