vllm-project · rkooo567 · May 23, 2024
@@ -765,7 +765,10 @@ def _get_stats(
 
             for idx, scheduled_seq_group in enumerate(
                     scheduler_outputs.scheduled_seq_groups):
+                # print(f"SANG-TODO {scheduler_outputs.num_prefill_groups=}")
                 group_was_prefill = idx < scheduler_outputs.num_prefill_groups
+                # print(f"SANG-TODO {group_was_prefill=}")
+                # print(f"SANG-TODO {idx=}")
                 seq_group = scheduled_seq_group.seq_group
 
                 # NOTE: a seq_group that completed all of its prefill tokens

diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
@@ -1174,6 +1174,7 @@ def _get_logits(
                            nan=float("-inf"),
                            posinf=float("inf"),
                            neginf=float("-inf")))
+        # print(f"SANG-TODO { logits[:,self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1]].shape=} {lora_logits.shape=} {self.indices_padded[:self.indices_len[2]]=} {hidden_states.shape=}")
         logits[:,
                self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
                lora_logits.shape[1]] = lora_logits

diff --git a/vllm/lora/models.py b/vllm/lora/models.py
@@ -144,6 +144,7 @@ def convert_mapping(
     if long_lora_indices_len is not None:
         indices_len.append(long_lora_indices_len)
 
+    # print(f"{mapping.prompt_mapping=} {sampler_indices=} {sampler_indices_padded=} {indices_len=}")
     return (base_indices, sampler_indices, sampler_indices_padded,
             embeddings_indices, long_lora_indices, indices_len)
 

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
@@ -379,12 +379,19 @@ def _prepare_model_input(
                     lora_requests.add(seq_group_metadata.lora_request)
 
                 lora_index_mapping += [lora_id] * (seq_len - context_len)
-                lora_prompt_mapping.extend(
-                    [lora_id] *
-                    (seq_len -
-                     context_len if seq_group_metadata.sampling_params
-                     and seq_group_metadata.sampling_params.prompt_logprobs
-                     else 1))
+                if (seq_group_metadata.sampling_params
+                        and seq_group_metadata.sampling_params.prompt_logprobs):
+                    lora_prompt_mapping.extend([lora_id] * (seq_len - context_len))
+                else:
+                    if seq_group_metadata.do_sample:
+                        lora_prompt_mapping.append(lora_id)
+                # lora_prompt_mapping.extend(
+                #     [lora_id] *
+                #     (seq_len -
+                #      context_len if seq_group_metadata.sampling_params
+                #      and seq_group_metadata.sampling_params.prompt_logprobs
+                #      else 1))
+                # print(f"{len(lora_prompt_mapping)=}")
 
                 if seq_group_metadata.multi_modal_data:
                     multi_modal_input_list.append(
@@ -675,6 +682,7 @@ def execute_model(
         (input_tokens, input_positions, attn_metadata, sampling_metadata,
          lora_requests, lora_mapping, multi_modal_input
          ) = self.prepare_input_tensors(seq_group_metadata_list)
+        # print(f"{input_tokens.shape=}")
 
         if self.lora_config:
             self.set_active_loras(lora_requests, lora_mapping)