vllm-project · zhuohan123 · Feb 16, 2024 · Jan 20, 2024 · Jan 20, 2024 · Jan 20, 2024
diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py
@@ -5,6 +5,8 @@
 import triton
 import triton.language as tl
 
+TESLA = torch.cuda.is_available() and 'Tesla' in torch.cuda.get_device_name(0)  # guarded so a CUDA-less import does not raise; selects BLOCK=64 instead of 128
+
 if triton.__version__ >= "2.1.0":
 
     @triton.jit
@@ -618,7 +620,8 @@ def context_attention_fwd(q,
                               b_ctx_len,
                               max_input_len,
                               alibi_slopes=None):
-        BLOCK = 128
+
+        BLOCK = 128 if not TESLA else 64
         # shape constraints
         Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
         assert Lq == Lk and Lk == Lv