From 5ba0e125d993228654f1ba801cdd05944863bf2f Mon Sep 17 00:00:00 2001
From: yzh119
Date: Sun, 3 Mar 2024 11:43:32 +0000
Subject: [PATCH] upd doc

---
 README.md                    |  4 ++--
 python/flashinfer/cascade.py |  3 ---
 python/flashinfer/decode.py  | 12 ++++++------
 python/flashinfer/prefill.py | 12 ++++++------
 4 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index f97f3bdd..adeffef6 100644
--- a/README.md
+++ b/README.md
@@ -72,13 +72,13 @@ num_qo_heads = 32
 q = torch.randn(num_qo_heads, head_dim).half().to(0)
 
 o = flashinfer.single_decode_with_kv_cache(q, k, v) # decode attention without RoPE on-the-fly
-o_rope_on_the_fly = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="LLAMA") # decode with LLaMA style RoPE on-the-fly
+o_rope_on_the_fly = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ROPE_LLAMA") # decode with LLaMA style RoPE on-the-fly
 
 # append attention
 append_qo_len = 128
 q = torch.randn(append_qo_len, num_qo_heads, head_dim).half().to(0) # append attention, the last 128 tokens in the KV-Cache are the new tokens
 o = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True) # append attention without RoPE on-the-fly, apply causal mask
-o_rope_on_the_fly = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True, pos_encoding_mode="LLAMA") # append attention with LLaMA style RoPE on-the-fly, apply causal mask
+o_rope_on_the_fly = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True, pos_encoding_mode="ROPE_LLAMA") # append attention with LLaMA style RoPE on-the-fly, apply causal mask
 
 # prefill attention
 qo_len = 2048
diff --git a/python/flashinfer/cascade.py b/python/flashinfer/cascade.py
index 3054c04a..52dd3161 100644
--- a/python/flashinfer/cascade.py
+++ b/python/flashinfer/cascade.py
@@ -419,9 +419,6 @@ def begin_forward(
             The dimension of the heads
         page_size : int
             The page size of the paged kv cache
-        pos_encoding_mode : str
-            Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
         data_type : Union[str, torch.dtype]
             The data type of the paged kv cache
 
diff --git a/python/flashinfer/decode.py b/python/flashinfer/decode.py
index 66ab7ed1..8b87a6c4 100644
--- a/python/flashinfer/decode.py
+++ b/python/flashinfer/decode.py
@@ -79,7 +79,7 @@ def single_decode_with_kv_cache(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     sm_scale : Optional[float]
         The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
     rope_scale : Optional[float]
@@ -168,7 +168,7 @@ def batch_decode_with_padded_kv_cache(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     sm_scale : Optional[float]
         The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
     rope_scale : Optional[float]
@@ -257,7 +257,7 @@ def batch_decode_with_padded_kv_cache_return_lse(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     sm_scale : Optional[float]
         The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
     rope_scale : Optional[float]
@@ -456,7 +456,7 @@ def begin_forward(
             The page size of the paged kv cache
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         data_type : Union[str, torch.dtype]
             The data type of the paged kv cache
 
@@ -525,7 +525,7 @@ def forward(
             :attr:`kv_layout` is ``HND``.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         sm_scale : Optional[float]
             The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
         rope_scale : Optional[float]
@@ -586,7 +586,7 @@ def forward_return_lse(
             :attr:`kv_layout` is ``HND``.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         sm_scale : Optional[float]
             The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
         rope_scale : Optional[float]
diff --git a/python/flashinfer/prefill.py b/python/flashinfer/prefill.py
index bf780c84..cc969d50 100644
--- a/python/flashinfer/prefill.py
+++ b/python/flashinfer/prefill.py
@@ -93,7 +93,7 @@ def single_prefill_with_kv_cache(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     allow_fp16_qk_reduction : bool
         Whether to use f16 for qk reduction (faster at the cost of slight
         precision loss).
@@ -191,7 +191,7 @@ def single_prefill_with_kv_cache_return_lse(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     allow_fp16_qk_reduction : bool
         Whether to use f16 for qk reduction (faster at the cost of slight
         precision loss).
@@ -460,7 +460,7 @@ def forward(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
@@ -529,7 +529,7 @@ def forward_return_lse(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
@@ -744,7 +744,7 @@ def forward(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
@@ -811,7 +811,7 @@ def forward_return_lse(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
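
Note for reviewers: a minimal sketch of how the renamed pos_encoding_mode values read after this patch, mirroring the README example above. The k/v construction, tensor shapes, and the "ALIBI" call are illustrative assumptions (the patch itself only updates documentation); they are not part of the diff.

# Sketch only: exercises the documented pos_encoding_mode values.
# Shapes follow the README example; k/v assume the default NHD layout.
import torch
import flashinfer

kv_len, num_kv_heads, num_qo_heads, head_dim = 2048, 32, 32, 128
k = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)
v = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)
q = torch.randn(num_qo_heads, head_dim).half().to(0)

# Decode attention: the mode previously spelled "LLAMA" is now "ROPE_LLAMA";
# "NONE" remains the default (no positional encoding applied in the kernel).
o_rope = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ROPE_LLAMA")

# "ALIBI" is the other mode listed in the updated docstrings (assumed available
# in the installed kernels).
o_alibi = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ALIBI")

# Append/prefill attention accepts the same values.
append_qo_len = 128
q_append = torch.randn(append_qo_len, num_qo_heads, head_dim).half().to(0)
o_append = flashinfer.single_prefill_with_kv_cache(
    q_append, k, v, causal=True, pos_encoding_mode="ROPE_LLAMA"
)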