
Commit

Merge branch 'main' into tyler/monkeypatch-qwen2vl
tyler-romero authored Sep 8, 2024
2 parents cf90fdd + 9250546 commit 292fef0
Showing 6 changed files with 178 additions and 137 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -13,4 +13,7 @@ build/
dist/

# Lockfiles
uv.lock
uv.lock

# Benchmark images
benchmark/visualizations
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -55,7 +55,7 @@ The `/benchmark` directory contains benchmarking scripts for the individual kernels
- Existing entries that are the same (based on `kernel_name`, `kernel_provider`, `kernel_operation_mode`, `metric_name`, `x_name`, `x_value`, `extra_benchmark_config_str`, and `gpu_name`) will not be overwritten.
2. Run `make run-benchmarks OVERWRITE=1` to overwrite any existing entries that have the same configuration.
3. Run `python benchmark/scripts/benchmark_{kernel_name}.py` to run an individual benchmark.
4. You can use the `benchmark/benchmarks_visualizer.ipynb` notebook as an example to load the CSV and perform data visualization/analysis.
4. You can use the `benchmark/benchmarks_visualizer.py` script to generate visualizations from the CSV; these are saved to the `benchmark/visualizations` directory (note: this directory is not tracked by git). See the example invocation below.
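   - For example, an invocation might look like this (the kernel and metric names here are illustrative): `python benchmark/benchmarks_visualizer.py --kernel-name rms_norm --metric-name speed --kernel-operation-mode full`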

## Submit PR
Fork the repo, copy and paste the successful test logs into the PR, and submit the PR following the PR template (**[example PR](https://github.com/linkedin/Liger-Kernel/pull/21)**).
5 changes: 3 additions & 2 deletions README.md
@@ -132,7 +132,7 @@ pip install -e .
```
## Getting Started

There are a couple ways to apply Liger kernels, depending on the level of customization required.
There are a couple of ways to apply Liger kernels, depending on the level of customization required.

### 1. Use AutoLigerKernelForCausalLM
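
A minimal sketch of this pattern (the full example in this section is collapsed in this view; the model path is illustrative):

```python
from liger_kernel.transformers import AutoLigerKernelForCausalLM

# AutoLigerKernelForCausalLM applies the Liger kernel patches for supported
# model types, then defers to the standard Hugging Face loading path.
model = AutoLigerKernelForCausalLM.from_pretrained("path/to/some/model")
```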

@@ -243,6 +243,7 @@ loss.backward()
| GeGLU | `liger_kernel.transformers.LigerGEGLUMLP` |
| CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
| FusedLinearCrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
| KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` |

- **RMSNorm**: [RMSNorm](https://arxiv.org/pdf/1910.07467), which normalizes activations using their root mean square, is implemented by fusing the normalization and scaling steps into a single Triton kernel, and achieves ~3X speedup with ~3X peak memory reduction.
- **LayerNorm**: [LayerNorm](https://arxiv.org/pdf/1607.06450), which centers and normalizes activations across the feature dimension, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and achieves ~2X speedup.
@@ -256,7 +257,7 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
- **CrossEntropy**: [Cross entropy loss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) is implemented by computing both the loss and gradient in the forward pass, with in-place replacement of the input to reduce peak memory by avoiding simultaneous materialization of both input logits and gradient. It achieves >2X speedup and >4X memory reduction for common vocab sizes (e.g., 32K, 128K).
<!-- TODO: verify vocab sizes are accurate -->
- **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.

- **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward pass into a single Triton kernel, with reduction done outside the kernel. It achieves ~1.5X speedup and ~15% memory reduction for 128K vocab size.
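
A minimal sketch of the KL divergence module in use, assuming its interface mirrors `torch.nn.KLDivLoss` (log-probabilities as input, probabilities as target); shapes, reduction, and device handling here are illustrative:

```python
import torch
from liger_kernel.transformers import LigerKLDIVLoss

# Assumption: LigerKLDIVLoss mirrors torch.nn.KLDivLoss -- `input` holds
# log-probabilities and `target` holds probabilities.
loss_fn = LigerKLDIVLoss(reduction="batchmean")

device = "cuda"  # Triton kernels require a GPU
batch_size, vocab_size = 4, 131072  # e.g., a 128K vocabulary

logits = torch.randn(batch_size, vocab_size, device=device, requires_grad=True)
log_q = torch.log_softmax(logits, dim=-1)
p = torch.softmax(torch.randn(batch_size, vocab_size, device=device), dim=-1)

loss = loss_fn(log_q, p)
loss.backward()
```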

### Experimental Kernels

132 changes: 0 additions & 132 deletions benchmark/benchmarks_visualizer.ipynb

This file was deleted.

169 changes: 169 additions & 0 deletions benchmark/benchmarks_visualizer.py
@@ -0,0 +1,169 @@
import json
import os
from argparse import ArgumentParser
from dataclasses import dataclass

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

DATA_PATH = "data/all_benchmark_data.csv"
VISUALIZATIONS_PATH = "visualizations/"


@dataclass
class VisualizationsConfig:
    """
    Configuration for the visualizations script.

    Args:
        kernel_name (str): Kernel name to benchmark. (Will run `scripts/benchmark_{kernel_name}.py`)
        metric_name (str): Metric name to visualize (speed/memory)
        kernel_operation_mode (str): Kernel operation mode to visualize (forward/backward/full). Defaults to "full"
        display (bool): Display the visualization. Defaults to False
        overwrite (bool): Overwrite an existing visualization; if none exists, this flag has no effect, as one is always created and saved. Defaults to False
    """

    kernel_name: str
    metric_name: str
    kernel_operation_mode: str = "full"
    display: bool = False
    overwrite: bool = False


def parse_args() -> VisualizationsConfig:
    """Parse command line arguments into a configuration object.

    Returns:
        VisualizationsConfig: Configuration object for the visualizations script.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--kernel-name", type=str, required=True, help="Kernel name to benchmark"
    )
    parser.add_argument(
        "--metric-name",
        type=str,
        required=True,
        help="Metric name to visualize (speed/memory)",
    )
    parser.add_argument(
        "--kernel-operation-mode",
        type=str,
        required=True,
        help="Kernel operation mode to visualize (forward/backward/full)",
    )
    parser.add_argument(
        "--display", action="store_true", help="Display the visualization"
    )
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite an existing visualization; if none exists, this flag has no effect, as one is always created",
    )

    args = parser.parse_args()

    return VisualizationsConfig(**vars(args))


def load_data(config: VisualizationsConfig) -> pd.DataFrame:
    """Loads the benchmark data from the CSV file and filters it based on the configuration.

    Args:
        config (VisualizationsConfig): Configuration object for the visualizations script.

    Raises:
        ValueError: If no data is found for the given filters.

    Returns:
        pd.DataFrame: Filtered benchmark dataframe.
    """
    df = pd.read_csv(DATA_PATH)
    df["extra_benchmark_config"] = df["extra_benchmark_config_str"].apply(json.loads)

    filtered_df = df[
        (df["kernel_name"] == config.kernel_name)
        & (df["metric_name"] == config.metric_name)
        & (df["kernel_operation_mode"] == config.kernel_operation_mode)
        # Use this to filter by an extra benchmark configuration property, e.g.:
        # & (df["extra_benchmark_config"].apply(lambda x: x.get("H") == 4096))
        # FIXME: maybe add a way to filter by arbitrary config properties instead of hardcoding them
    ]

    if filtered_df.empty:
        raise ValueError("No data found for the given filters")

    return filtered_df


def plot_data(df: pd.DataFrame, config: VisualizationsConfig):
    """Plots the benchmark data, saving the result if needed.

    Args:
        df (pd.DataFrame): Filtered benchmark dataframe.
        config (VisualizationsConfig): Configuration object for the visualizations script.
    """
    xlabel = df["x_label"].iloc[0]
    ylabel = f"{config.metric_name} ({df['metric_unit'].iloc[0]})"
    # Sort by "kernel_provider" to ensure consistent color assignment
    df = df.sort_values(by="kernel_provider")

    plt.figure(figsize=(10, 6))
    sns.set(style="whitegrid")
    ax = sns.lineplot(
        data=df,
        x="x_value",
        y="y_value_50",
        hue="kernel_provider",
        marker="o",
        palette="tab10",
        errorbar=("ci", None),
    )

    # Seaborn can't plot pre-computed error bars, so we need to do it manually
    lines = ax.get_lines()
    colors = [line.get_color() for line in lines]

    for (_, group_data), color in zip(df.groupby("kernel_provider"), colors):
        # Asymmetric error bars from the 20th/80th percentiles around the median
        y_error_lower = group_data["y_value_50"] - group_data["y_value_20"]
        y_error_upper = group_data["y_value_80"] - group_data["y_value_50"]
        y_error = [y_error_lower, y_error_upper]

        plt.errorbar(
            group_data["x_value"],
            group_data["y_value_50"],
            yerr=y_error,
            fmt="o",
            color=color,
            capsize=5,
        )

    plt.legend(title="Kernel Provider")
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()

    out_path = os.path.join(
        VISUALIZATIONS_PATH, f"{config.kernel_name}_{config.metric_name}.png"
    )

    if config.display:
        plt.show()
    # Save the plot if it doesn't exist or if we want to overwrite it
    if config.overwrite or not os.path.exists(out_path):
        os.makedirs(VISUALIZATIONS_PATH, exist_ok=True)
        plt.savefig(out_path)
    plt.close()


def main():
    config = parse_args()
    df = load_data(config)
    plot_data(df, config)


if __name__ == "__main__":
    main()
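
A minimal usage sketch for this script (the kernel and metric names are illustrative, and the import path is assumed; `DATA_PATH` is relative, so run from the `benchmark/` directory):

```python
# Programmatic equivalent of:
#   python benchmarks_visualizer.py --kernel-name rms_norm --metric-name speed --kernel-operation-mode full
from benchmarks_visualizer import VisualizationsConfig, load_data, plot_data

config = VisualizationsConfig(kernel_name="rms_norm", metric_name="speed")
df = load_data(config)
plot_data(df, config)  # writes visualizations/rms_norm_speed.png unless it already exists
```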
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -29,7 +29,7 @@ dev = [

[tool.setuptools.packages.find]
where = ["src"]
include = ["liger_kernel"]
include = ["liger_kernel", "liger_kernel.*"]

[tool.pytest.ini_options]
pythonpath = [
