[REMOVE ME] Patch _grouped_size_compiled_for_decode_kernels for flashinfer v0.2.0 (flashinfer-ai/flashinfer#549)
james-p-xu · Nov 16, 2024 · eec4854 · eec4854
1 parent 6bb3979
commit eec4854
Showing 1 changed file with 6 additions and 1 deletion.
diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -30,14 +30,19 @@
         BatchPrefillWithRaggedKVCacheWrapper,
     )
     from flashinfer.cascade import merge_state
-    from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
 
 
 class WrapperDispatch(Enum):  # Enum with two members whose values are assigned by auto().
     SLIDING_WINDOW = auto()
     CROSS_ATTENTION = auto()
 
 
+def _grouped_size_compiled_for_decode_kernels(  # True iff num_qo_heads // num_kv_heads is 1, 2, 4 or 8.
+    num_qo_heads: int, num_kv_heads: int
+) -> bool:  # TODO: Remove me! https://github.com/flashinfer-ai/flashinfer/issues/549
+    return (num_qo_heads // num_kv_heads) in [1, 2, 4, 8]  # NOTE(review): // floors, so e.g. 5 // 2 == 2 passes; confirm callers ensure divisibility.
+
+
 class FlashInferAttnBackend(AttentionBackend):
     """Flashinfer attention kernels."""