From 324fcec9c6e3166d0420160a0142024301ad0372 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Mon, 26 Aug 2024 12:58:54 -0700 Subject: [PATCH] compute last --- vllm/worker/model_runner.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1414f77baa4d5..665d9dc57244b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -531,18 +531,16 @@ def _compute_for_prefix_cache_hit( inter_data.query_lens[ seq_idx] = inter_data.seq_lens[seq_idx] - context_len elif seq_len <= prefix_cache_len: - # Full hit. Only compute the last block to avoid + # Full hit. Only compute the last token to avoid # erroneous behavior. FIXME: Ideally we should directly # mark all tokens as computed in the scheduler and do not # schedule this sequence, so this case should not happen. - block_size = self.block_size inter_data.input_tokens[seq_idx] = inter_data.input_tokens[ - seq_idx][-block_size:] + seq_idx][-1:] inter_data.input_positions[seq_idx] = inter_data.input_positions[ - seq_idx][-block_size:] - inter_data.query_lens[seq_idx] = block_size - inter_data.context_lens[seq_idx] = inter_data.seq_lens[ - seq_idx] - inter_data.query_lens[seq_idx] + seq_idx][-1:] + inter_data.query_lens[seq_idx] = 1 + inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1 def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup, seq_idx: int,