Skip to content

Commit

Permalink
bugfix: do not use non-blocking copy for gpu to cpu transfer (#564)
Browse files Browse the repository at this point in the history
If we use an async (non-blocking) GPU-to-CPU copy in `plan` functions, we must
synchronize before reading the CPU array.
Since synchronization has been removed from the `plan` functions, the GPU-to-CPU
copy itself must now be blocking.

For flashinfer v0.2, users are encouraged to pass CPU indptr arrays to `plan`
functions; in that case the blocking GPU-to-CPU copy is a no-op, because the
data is already on the CPU.
  • Loading branch information
yzh119 authored Oct 27, 2024
1 parent 4800368 commit d30667b
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 6 deletions.
2 changes: 1 addition & 1 deletion python/flashinfer/decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,7 @@ def plan(
)
self._qo_indptr_buf = qo_indptr_host.to(self.device, non_blocking=True)

indptr_host = indptr.to("cpu", non_blocking=True)
indptr_host = indptr.to("cpu")
if data_type is not None:
q_data_type = data_type
kv_data_type = data_type
Expand Down
8 changes: 4 additions & 4 deletions python/flashinfer/prefill.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,8 +1004,8 @@ def plan(
self._qk_indptr_buf = qk_indptr.to(self.device, non_blocking=True)

# NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors
qo_indptr_host = qo_indptr.to("cpu", non_blocking=True)
paged_kv_indptr_host = paged_kv_indptr.to("cpu", non_blocking=True)
qo_indptr_host = qo_indptr.to("cpu")
paged_kv_indptr_host = paged_kv_indptr.to("cpu")

if packed_custom_mask is not None:
mask_mode = MaskMode.CUSTOM.value
Expand Down Expand Up @@ -1571,8 +1571,8 @@ def plan(
self._qk_indptr_buf = qk_indptr.to(self.device)

# NOTE(Zihao): only required if qo_indptr/paged_kv_indptr are device tensors
qo_indptr_host = qo_indptr.to("cpu", non_blocking=True)
kv_indptr_host = kv_indptr.to("cpu", non_blocking=True)
qo_indptr_host = qo_indptr.to("cpu")
kv_indptr_host = kv_indptr.to("cpu")

if packed_custom_mask is not None:
mask_mode = MaskMode.CUSTOM.value
Expand Down
2 changes: 1 addition & 1 deletion python/flashinfer/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def plan(
self.R = R
self.C = C

kv_indptr_host = indptr.to("cpu", non_blocking=True)
kv_indptr_host = indptr.to("cpu")

# NOTE(Zihao): we haven't supported mask in cuda-core implementations but it should
# be easy to add support for it if needed, leave it as a future work.
Expand Down

0 comments on commit d30667b

Please sign in to comment.