Skip to content

Commit

Permalink
compute last
Browse files Browse the repository at this point in the history
  • Loading branch information
comaniac committed Aug 26, 2024
1 parent d57951f commit 324fcec
Showing 1 changed file with 5 additions and 7 deletions.
12 changes: 5 additions & 7 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,18 +531,16 @@ def _compute_for_prefix_cache_hit(
  inter_data.query_lens[
  seq_idx] = inter_data.seq_lens[seq_idx] - context_len
  elif seq_len <= prefix_cache_len:
- # Full hit. Only compute the last block to avoid
+ # Full hit. Only compute the last token to avoid
  # erroneous behavior. FIXME: Ideally we should directly
  # mark all tokens as computed in the scheduler and do not
  # schedule this sequence, so this case should not happen.
- block_size = self.block_size
  inter_data.input_tokens[seq_idx] = inter_data.input_tokens[
- seq_idx][-block_size:]
+ seq_idx][-1:]
  inter_data.input_positions[seq_idx] = inter_data.input_positions[
- seq_idx][-block_size:]
- inter_data.query_lens[seq_idx] = block_size
- inter_data.context_lens[seq_idx] = inter_data.seq_lens[
- seq_idx] - inter_data.query_lens[seq_idx]
+ seq_idx][-1:]
+ inter_data.query_lens[seq_idx] = 1
+ inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1

def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup,
seq_idx: int,
Expand Down

0 comments on commit 324fcec

Please sign in to comment.