From 5ba0e125d993228654f1ba801cdd05944863bf2f Mon Sep 17 00:00:00 2001
From: yzh119
Date: Sun, 3 Mar 2024 11:43:32 +0000
Subject: [PATCH] upd doc

---
 README.md                    |  4 ++--
 python/flashinfer/cascade.py |  3 ---
 python/flashinfer/decode.py  | 12 ++++++------
 python/flashinfer/prefill.py | 12 ++++++------
 4 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index f97f3bdd..adeffef6 100644
--- a/README.md
+++ b/README.md
@@ -72,13 +72,13 @@ num_qo_heads = 32
 q = torch.randn(num_qo_heads, head_dim).half().to(0)
 
 o = flashinfer.single_decode_with_kv_cache(q, k, v) # decode attention without RoPE on-the-fly
-o_rope_on_the_fly = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="LLAMA") # decode with LLaMA style RoPE on-the-fly
+o_rope_on_the_fly = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ROPE_LLAMA") # decode with LLaMA style RoPE on-the-fly
 
 # append attention
 append_qo_len = 128
 q = torch.randn(append_qo_len, num_qo_heads, head_dim).half().to(0) # append attention, the last 128 tokens in the KV-Cache are the new tokens
 o = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True) # append attention without RoPE on-the-fly, apply causal mask
-o_rope_on_the_fly = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True, pos_encoding_mode="LLAMA") # append attention with LLaMA style RoPE on-the-fly, apply causal mask
+o_rope_on_the_fly = flashinfer.single_prefill_with_kv_cache(q, k, v, causal=True, pos_encoding_mode="ROPE_LLAMA") # append attention with LLaMA style RoPE on-the-fly, apply causal mask
 
 # prefill attention
 qo_len = 2048
diff --git a/python/flashinfer/cascade.py b/python/flashinfer/cascade.py
index 3054c04a..52dd3161 100644
--- a/python/flashinfer/cascade.py
+++ b/python/flashinfer/cascade.py
@@ -419,9 +419,6 @@ def begin_forward(
             The dimension of the heads
         page_size : int
             The page size of the paged kv cache
-        pos_encoding_mode : str
-            Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
         data_type : Union[str, torch.dtype]
             The data type of the paged kv cache
 
diff --git a/python/flashinfer/decode.py b/python/flashinfer/decode.py
index 66ab7ed1..8b87a6c4 100644
--- a/python/flashinfer/decode.py
+++ b/python/flashinfer/decode.py
@@ -79,7 +79,7 @@ def single_decode_with_kv_cache(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     sm_scale : Optional[float]
         The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
     rope_scale : Optional[float]
@@ -168,7 +168,7 @@ def batch_decode_with_padded_kv_cache(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     sm_scale : Optional[float]
         The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
     rope_scale : Optional[float]
@@ -257,7 +257,7 @@ def batch_decode_with_padded_kv_cache_return_lse(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     sm_scale : Optional[float]
         The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
     rope_scale : Optional[float]
@@ -456,7 +456,7 @@ def begin_forward(
             The page size of the paged kv cache
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         data_type : Union[str, torch.dtype]
             The data type of the paged kv cache
 
@@ -525,7 +525,7 @@ def forward(
             :attr:`kv_layout` is ``HND``.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         sm_scale : Optional[float]
             The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
         rope_scale : Optional[float]
@@ -586,7 +586,7 @@ def forward_return_lse(
             :attr:`kv_layout` is ``HND``.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         sm_scale : Optional[float]
             The scale of softmax, if not provided, will be set to ``1 / sqrt(head_dim)``.
         rope_scale : Optional[float]
diff --git a/python/flashinfer/prefill.py b/python/flashinfer/prefill.py
index bf780c84..cc969d50 100644
--- a/python/flashinfer/prefill.py
+++ b/python/flashinfer/prefill.py
@@ -93,7 +93,7 @@ def single_prefill_with_kv_cache(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     allow_fp16_qk_reduction : bool
         Whether to use f16 for qk reduction (faster at the cost of slight
         precision loss).
@@ -191,7 +191,7 @@ def single_prefill_with_kv_cache_return_lse(
         The layout of the input k/v tensors, could be either ``NHD`` or ``HND``.
     pos_encoding_mode : str
         Whether to apply RoPE on-the-fly inside attention kernels, could be
-        ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+        ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
     allow_fp16_qk_reduction : bool
         Whether to use f16 for qk reduction (faster at the cost of slight
         precision loss).
@@ -460,7 +460,7 @@ def forward(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
@@ -529,7 +529,7 @@ def forward_return_lse(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
@@ -744,7 +744,7 @@ def forward(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
@@ -811,7 +811,7 @@ def forward_return_lse(
             Whether to apply causal mask to the attention matrix.
         pos_encoding_mode : str
             Whether to apply RoPE on-the-fly inside attention kernels, could be
-            ``NONE`` or ``LLAMA`` (LLAMA style rotary embedding).
+            ``NONE``/``ROPE_LLAMA`` (LLAMA style rotary embedding)/``ALIBI``.
         allow_fp16_qk_reduction : bool
             Whether to use f16 for qk reduction (faster at the cost of slight
             precision loss).
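
Note for reviewers: a minimal sketch of how the renamed pos_encoding_mode values read after this patch, mirroring the README example above. The k/v construction, tensor shapes, and the "ALIBI" call are illustrative assumptions (the patch itself only updates documentation); they are not part of the diff.

# Sketch only: exercises the documented pos_encoding_mode values.
# Shapes follow the README example; k/v assume the default NHD layout.
import torch
import flashinfer

kv_len, num_kv_heads, num_qo_heads, head_dim = 2048, 32, 32, 128
k = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)
v = torch.randn(kv_len, num_kv_heads, head_dim).half().to(0)
q = torch.randn(num_qo_heads, head_dim).half().to(0)

# Decode attention: the mode previously spelled "LLAMA" is now "ROPE_LLAMA";
# "NONE" remains the default (no positional encoding applied in the kernel).
o_rope = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ROPE_LLAMA")

# "ALIBI" is the other mode listed in the updated docstrings (assumed available
# in the installed kernels).
o_alibi = flashinfer.single_decode_with_kv_cache(q, k, v, pos_encoding_mode="ALIBI")

# Append/prefill attention accepts the same values.
append_qo_len = 128
q_append = torch.randn(append_qo_len, num_qo_heads, head_dim).half().to(0)
o_append = flashinfer.single_prefill_with_kv_cache(
    q_append, k, v, causal=True, pos_encoding_mode="ROPE_LLAMA"
)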