From d30667b0a23c1cc9135f7557404409ca1a9b9f02 Mon Sep 17 00:00:00 2001 From: Zihao Ye Date: Sun, 27 Oct 2024 02:46:22 -0700 Subject: [PATCH] bugfix: do not use non-blocking copy for gpu to cpu transfer (#564) If we use async GPU to CPU copy in `plan` functions, we need to synchronize before we use the cpu array. Since we have removed synchronization in plan functions, the GPU to CPU copy should be synchronized. For flashinfer v0.2, it's encouraged to pass cpu indptr arrays to `plan` functions, and the synchronized GPU to CPU copy will be a no-op in this case. --- python/flashinfer/decode.py | 2 +- python/flashinfer/prefill.py | 8 ++++---- python/flashinfer/sparse.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/flashinfer/decode.py b/python/flashinfer/decode.py index 643416dd..5215ae45 100644 --- a/python/flashinfer/decode.py +++ b/python/flashinfer/decode.py @@ -719,7 +719,7 @@ def plan( ) self._qo_indptr_buf = qo_indptr_host.to(self.device, non_blocking=True) - indptr_host = indptr.to("cpu", non_blocking=True) + indptr_host = indptr.to("cpu") if data_type is not None: q_data_type = data_type kv_data_type = data_type diff --git a/python/flashinfer/prefill.py b/python/flashinfer/prefill.py index 5922d71a..bfb8f48e 100644 --- a/python/flashinfer/prefill.py +++ b/python/flashinfer/prefill.py @@ -1004,8 +1004,8 @@ def plan( self._qk_indptr_buf = qk_indptr.to(self.device, non_blocking=True) # NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors - qo_indptr_host = qo_indptr.to("cpu", non_blocking=True) - paged_kv_indptr_host = paged_kv_indptr.to("cpu", non_blocking=True) + qo_indptr_host = qo_indptr.to("cpu") + paged_kv_indptr_host = paged_kv_indptr.to("cpu") if packed_custom_mask is not None: mask_mode = MaskMode.CUSTOM.value @@ -1571,8 +1571,8 @@ def plan( self._qk_indptr_buf = qk_indptr.to(self.device) # NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors - qo_indptr_host = qo_indptr.to("cpu", 
non_blocking=True) - kv_indptr_host = kv_indptr.to("cpu", non_blocking=True) + qo_indptr_host = qo_indptr.to("cpu") + kv_indptr_host = kv_indptr.to("cpu") if packed_custom_mask is not None: mask_mode = MaskMode.CUSTOM.value diff --git a/python/flashinfer/sparse.py b/python/flashinfer/sparse.py index 55d36e06..7b8c8e4e 100644 --- a/python/flashinfer/sparse.py +++ b/python/flashinfer/sparse.py @@ -298,7 +298,7 @@ def plan( self.R = R self.C = C - kv_indptr_host = indptr.to("cpu", non_blocking=True) + kv_indptr_host = indptr.to("cpu") # NOTE(Zihao): we haven't supported mask in cuda-core implementations but it should # be easy to add support for it if needed, leave it as a future work.