diff --git a/python/flashinfer/decode.py b/python/flashinfer/decode.py index 643416dd..5215ae45 100644 --- a/python/flashinfer/decode.py +++ b/python/flashinfer/decode.py @@ -719,7 +719,7 @@ def plan( ) self._qo_indptr_buf = qo_indptr_host.to(self.device, non_blocking=True) - indptr_host = indptr.to("cpu", non_blocking=True) + indptr_host = indptr.to("cpu") if data_type is not None: q_data_type = data_type kv_data_type = data_type diff --git a/python/flashinfer/prefill.py b/python/flashinfer/prefill.py index 5922d71a..bfb8f48e 100644 --- a/python/flashinfer/prefill.py +++ b/python/flashinfer/prefill.py @@ -1004,8 +1004,8 @@ def plan( self._qk_indptr_buf = qk_indptr.to(self.device, non_blocking=True) # NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors - qo_indptr_host = qo_indptr.to("cpu", non_blocking=True) - paged_kv_indptr_host = paged_kv_indptr.to("cpu", non_blocking=True) + qo_indptr_host = qo_indptr.to("cpu") + paged_kv_indptr_host = paged_kv_indptr.to("cpu") if packed_custom_mask is not None: mask_mode = MaskMode.CUSTOM.value @@ -1571,8 +1571,8 @@ def plan( self._qk_indptr_buf = qk_indptr.to(self.device) # NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors - qo_indptr_host = qo_indptr.to("cpu", non_blocking=True) - kv_indptr_host = kv_indptr.to("cpu", non_blocking=True) + qo_indptr_host = qo_indptr.to("cpu") + kv_indptr_host = kv_indptr.to("cpu") if packed_custom_mask is not None: mask_mode = MaskMode.CUSTOM.value diff --git a/python/flashinfer/sparse.py b/python/flashinfer/sparse.py index 55d36e06..7b8c8e4e 100644 --- a/python/flashinfer/sparse.py +++ b/python/flashinfer/sparse.py @@ -298,7 +298,7 @@ def plan( self.R = R self.C = C - kv_indptr_host = indptr.to("cpu", non_blocking=True) + kv_indptr_host = indptr.to("cpu") # NOTE(Zihao): we haven't supported mask in cuda-core implementations but it should # be easy to add support for it if needed, leave it as a future work.