
[CORE] Adding support for insertion of soft-tuned prompts #4645

Merged
merged 91 commits on Jul 9, 2024
Changes from 1 commit
Commits (91)
04f262e
soft prompt support
Jun 3, 2024
96b4a1a
Run yapf and ruff
Jun 3, 2024
3131273
Multimodal fix
Jun 3, 2024
e9ff38b
correctness update
Jun 3, 2024
9f0a8ae
formatting
Jun 3, 2024
c2937d1
formatting
Jun 3, 2024
e43e89b
reverting to hasattr
Jun 3, 2024
a2b4fc3
adapter commons fix
Jun 3, 2024
3ebee19
minor fixes
Jun 3, 2024
629a684
formatting
Jun 3, 2024
a3ad6ac
reset_adapter
Jun 3, 2024
dcd7e88
bugfix
Jun 3, 2024
647a32d
reset_adapter fix
Jun 4, 2024
90d170c
peft dependencies
Jun 4, 2024
0fca895
fixing llava bug
Jun 4, 2024
d4e531c
typing fix
Jun 4, 2024
b7f8256
async engine update
Jun 4, 2024
449d988
batchwise processing
Jun 5, 2024
f28b66e
formatting
Jun 5, 2024
220deef
formatting yapf
Jun 5, 2024
01b9bb8
formatting again
Jun 5, 2024
2ea2796
enable_adapter paramter
Jun 5, 2024
96fe5ae
formatting
Jun 5, 2024
47725d9
adding test
Jun 5, 2024
638795a
adding test
Jun 5, 2024
f7d53b3
test case update
Jun 5, 2024
16f4037
formatting
Jun 5, 2024
f2f3cbc
resetting
Jun 13, 2024
0fc0c34
formatting
Jun 13, 2024
4eb47d6
formatting
Jun 13, 2024
e69842b
formatting
Jun 13, 2024
5c17480
Fix async engine
g-eoj Jun 13, 2024
e62cbb5
Initial implementation of openai entrypoint
g-eoj Jun 13, 2024
20fc56f
Merge branch 'main' into main
SwapnilDreams100 Jun 13, 2024
612d6c5
Fixes
g-eoj Jun 13, 2024
894b9ba
async changes
Jun 18, 2024
00efe02
Merge branch 'main' into main
SwapnilDreams100 Jun 18, 2024
155ad76
formattign
Jun 18, 2024
042c9f1
formatting
Jun 18, 2024
0e46a06
adding dtype flexibility + pa lora refactor
Jun 23, 2024
3d14475
formatting
Jun 23, 2024
86e72de
formatting
Jun 23, 2024
41934cc
xpu compatibility
Jun 23, 2024
fdfec59
xpu compatibility
Jun 23, 2024
6b1f0e7
xpu compatibility
Jun 23, 2024
01bb713
xpu compatibility
Jun 23, 2024
3e5e147
Merge branch 'main' into main
SwapnilDreams100 Jun 23, 2024
d7312e2
formatting
Jun 23, 2024
454d45b
formatting + updating tests
Jun 24, 2024
409dba1
test changes
Jun 24, 2024
ab95ad7
cpu-gpu sync changes + adapter abstract changes
Jun 26, 2024
2faec61
formatting
Jun 26, 2024
f1a607c
Merge branch 'main' into main
SwapnilDreams100 Jun 26, 2024
6955301
rebase
Jun 26, 2024
2814aee
peft fix
Jun 26, 2024
0e45660
minor fix
Jun 26, 2024
d58e355
formatting
Jun 26, 2024
d700324
forward update
Jun 30, 2024
a5610a7
formatting
Jun 30, 2024
6b1c5ef
Merge branch 'main' into main
SwapnilDreams100 Jul 1, 2024
8b6e827
formatting
Jul 1, 2024
b83b6f0
spec decode fix
Jul 1, 2024
4babf0f
Merge branch 'main' into main
SwapnilDreams100 Jul 2, 2024
791ffbd
formatting
Jul 2, 2024
7226246
Merge branch 'main' into main
SwapnilDreams100 Jul 2, 2024
215947d
async executor
Jul 2, 2024
9ae47e8
formatting
Jul 2, 2024
3a2b545
formatting
Jul 2, 2024
bbaea88
formatting
Jul 2, 2024
34dbc8f
Merge branch 'main' into openai-entrypoint
g-eoj Jul 3, 2024
9c2cc27
Merge branch 'main' into main
SwapnilDreams100 Jul 3, 2024
cdcea67
formatting
Jul 3, 2024
e771d43
max_prompt_adapter_token defaults + error messages
Jul 3, 2024
503adf4
updating tests
Jul 3, 2024
45c12ee
fix eager issue
Jul 5, 2024
9a73128
Merge branch 'main' into main
SwapnilDreams100 Jul 5, 2024
13d42c6
formatting
Jul 5, 2024
b2f3842
formatting
Jul 5, 2024
191f2c9
replacing numel w ndim for LoRA consistency
Jul 6, 2024
50514c3
Update tests/prompt_adapter/test_bloom.py
SwapnilDreams100 Jul 8, 2024
1217964
Update vllm/prompt_adapter/models.py
SwapnilDreams100 Jul 8, 2024
f9a5b4a
formatting
Jul 8, 2024
8545205
formatting
Jul 8, 2024
2d5c246
formatting
Jul 8, 2024
3da2777
docs update
Jul 8, 2024
9634b9d
Merge pull request #2 from g-eoj/openai-entrypoint
SwapnilDreams100 Jul 9, 2024
8279496
formatting
Jul 9, 2024
4336df1
formatting
Jul 9, 2024
77183d7
quick openapi fix
Jul 9, 2024
dd887f8
formatting
Jul 9, 2024
67a9f17
formatting
Jul 9, 2024
batchwise processing
Swapnil Parekh committed Jun 13, 2024
commit 449d988ad220b1abfd52334716e445d7106522a5
13 changes: 13 additions & 0 deletions vllm/adapter_commons/layers.py
@@ -0,0 +1,13 @@
from dataclasses import dataclass
from typing import Tuple

@dataclass
class AdapterMapping:
# Per every token in input_ids:
index_mapping: Tuple[int, ...]
# Per sampled token:
prompt_mapping: Tuple[int, ...]

def __post_init__(self):
self.index_mapping = tuple(self.index_mapping)
self.prompt_mapping = tuple(self.prompt_mapping)
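
For context, this new vllm/adapter_commons/layers.py file factors the token-to-adapter mapping shared by LoRA and prompt adapters into one base dataclass. Below is a minimal standalone sketch of how it behaves; the dataclass mirrors the hunk above, while the subclass name and example values are only illustrative:

```python
from dataclasses import dataclass
from typing import Tuple


@dataclass
class AdapterMapping:
    # Per every token in input_ids:
    index_mapping: Tuple[int, ...]
    # Per sampled token:
    prompt_mapping: Tuple[int, ...]

    def __post_init__(self):
        self.index_mapping = tuple(self.index_mapping)
        self.prompt_mapping = tuple(self.prompt_mapping)


# Subclasses such as LoRAMapping and PromptAdapterMapping add no fields;
# they just give each adapter type its own mapping class.
@dataclass
class PromptAdapterMapping(AdapterMapping):
    pass


# Lists are accepted and coerced to tuples by __post_init__.
mapping = PromptAdapterMapping(index_mapping=[1, 1, 0, 0], prompt_mapping=[1, 0])
assert mapping.index_mapping == (1, 1, 0, 0)
assert mapping.prompt_mapping == (1, 0)
```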
12 changes: 3 additions & 9 deletions vllm/lora/layers.py
@@ -8,6 +8,7 @@
import torch.nn.functional as F
from transformers import PretrainedConfig

from vllm.adapter_commons.layers import AdapterMapping
from vllm.config import LoRAConfig
from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
@@ -134,15 +135,8 @@ def _apply_lora_packed_nslice(


@dataclass
class LoRAMapping:
# Per every token in input_ids:
index_mapping: Tuple[int, ...]
# Per sampled token:
prompt_mapping: Tuple[int, ...]

def __post_init__(self):
self.index_mapping = tuple(self.index_mapping)
self.prompt_mapping = tuple(self.prompt_mapping)
class LoRAMapping(AdapterMapping):
pass


class BaseLayerWithLoRA(nn.Module):
2 changes: 0 additions & 2 deletions vllm/model_executor/models/baichuan.py
@@ -43,7 +43,6 @@
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.prompt_adapter.layers import apply_prompt_adapter
from vllm.sequence import SamplerOutput


@@ -279,7 +278,6 @@ def forward(
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
hidden_states = self.embed_tokens(input_ids)
hidden_states = apply_prompt_adapter(self, hidden_states, positions)
residual = None
for i in range(len(self.layers)):
layer = self.layers[i]
2 changes: 0 additions & 2 deletions vllm/model_executor/models/bloom.py
@@ -39,7 +39,6 @@
VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.prompt_adapter.layers import apply_prompt_adapter
from vllm.sequence import SamplerOutput


@@ -252,7 +251,6 @@ def forward(
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
hidden_states = self.word_embeddings(input_ids)
hidden_states = apply_prompt_adapter(self, hidden_states, position_ids)
hidden_states = self.word_embeddings_layernorm(hidden_states)
for i in range(len(self.h)):
layer = self.h[i]
2 changes: 0 additions & 2 deletions vllm/model_executor/models/gpt_bigcode.py
@@ -39,7 +39,6 @@
VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.prompt_adapter.layers import apply_prompt_adapter
from vllm.sequence import SamplerOutput


@@ -220,7 +219,6 @@ def forward(
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
inputs_embeds = self.wte(input_ids)
inputs_embeds = apply_prompt_adapter(self, inputs_embeds, position_ids)
position_embeds = self.wpe(position_ids)
hidden_states = inputs_embeds + position_embeds

2 changes: 0 additions & 2 deletions vllm/model_executor/models/llama.py
@@ -46,7 +46,6 @@
from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, kv_cache_scales_loader)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.prompt_adapter.layers import apply_prompt_adapter
from vllm.sequence import SamplerOutput
from vllm.utils import is_hip, print_warning_once

@@ -283,7 +282,6 @@ def forward(
hidden_states = inputs_embeds
else:
hidden_states = self.get_input_embeddings(input_ids)
hidden_states = apply_prompt_adapter(self, hidden_states, positions)
residual = None
for i in range(len(self.layers)):
layer = self.layers[i]
2 changes: 0 additions & 2 deletions vllm/model_executor/models/mixtral.py
@@ -51,7 +51,6 @@
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.prompt_adapter.layers import apply_prompt_adapter
from vllm.sequence import SamplerOutput
from vllm.utils import print_warning_once

@@ -463,7 +462,6 @@ def forward(
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
hidden_states = self.embed_tokens(input_ids)
hidden_states = apply_prompt_adapter(self, hidden_states, positions)
residual = None
for i in range(len(self.layers)):
layer = self.layers[i]
69 changes: 50 additions & 19 deletions vllm/prompt_adapter/layers.py
@@ -1,25 +1,56 @@
from dataclasses import dataclass
from typing import Tuple
from typing import Dict, List, Optional

import numpy
import torch
from torch import nn

from vllm.adapter_commons.layers import AdapterMapping
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)


@dataclass
class PromptAdapterMapping:
# Per every token in input_ids:
index_mapping: Tuple[int, ...]
# Per sampled token:
prompt_mapping: Tuple[int, ...]

def __post_init__(self):
self.index_mapping = tuple(self.index_mapping)
self.prompt_mapping = tuple(self.prompt_mapping)


def apply_prompt_adapter(instance, hidden_states: torch.Tensor,
positions: torch.Tensor) -> torch.Tensor:
if hasattr(instance, 'prefix_encoder'):
soft_prompt = instance.prefix_encoder.prompt_embedding
indices = (positions < soft_prompt.shape[0])
hidden_states[indices] = soft_prompt[positions[indices]]
return hidden_states
class PromptAdapterMapping(AdapterMapping):
pass


class VocabParallelEmbeddingWithPromptAdapter(nn.Module):

def __init__(self, base_layer: VocabParallelEmbedding) -> None:
super().__init__()
self.base_layer = base_layer
self.embedding_tensors: Dict[int, torch.Tensor] = {}
self.indices: torch.Tensor

def reset_prompt_adapter(self, index: int):
self.embedding_tensors[index] = 0

def set_prompt_adapter(
self,
index: int,
embeddings_tensor: Optional[torch.Tensor],
):
self.reset_prompt_adapter(index)
if embeddings_tensor is not None:
self.embedding_tensors[index] = embeddings_tensor

def set_mapping(
self,
base_indices: List[int],
):
self.indices = base_indices

def forward(self, x: torch.Tensor) -> torch.Tensor:
hidden_states = self.base_layer(x)
unique_indices = numpy.unique(self.indices)
for idx in unique_indices:
if idx != 0:
pa_idx = self.embedding_tensors[idx].prompt_embedding
mask = (self.indices == idx)
try:
n_adapters = sum(mask) // pa_idx.shape[0]
hidden_states[mask] = pa_idx.repeat(n_adapters, 1)
except Exception:
pass
return hidden_states
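
The rewritten vllm/prompt_adapter/layers.py above moves soft-prompt insertion out of the per-model forward hooks and into a wrapper around the embedding layer, applied batchwise from per-token adapter indices. A rough self-contained sketch of that idea follows, using a plain torch.nn.Embedding as a stand-in for VocabParallelEmbedding; the class name, shapes, and index values here are hypothetical, not the PR's API:

```python
from typing import Dict, Optional

import torch
from torch import nn


class EmbeddingWithSoftPrompt(nn.Module):
    """Toy stand-in for VocabParallelEmbeddingWithPromptAdapter."""

    def __init__(self, base_layer: nn.Embedding) -> None:
        super().__init__()
        self.base_layer = base_layer
        # adapter slot -> soft-prompt tensor of shape [num_virtual_tokens, hidden_size]
        self.embedding_tensors: Dict[int, torch.Tensor] = {}
        self.indices: Optional[torch.Tensor] = None  # per-token adapter index

    def set_prompt_adapter(self, index: int, embedding: torch.Tensor) -> None:
        self.embedding_tensors[index] = embedding

    def set_mapping(self, base_indices: torch.Tensor) -> None:
        self.indices = base_indices

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden_states = self.base_layer(x)
        for idx in torch.unique(self.indices).tolist():
            if idx == 0:  # 0 marks tokens with no active prompt adapter
                continue
            soft_prompt = self.embedding_tensors[idx]
            mask = self.indices == idx
            # Assumes the masked positions are exactly the virtual-token slots
            # of the request(s) using this adapter, stacked in order.
            n_repeats = int(mask.sum()) // soft_prompt.shape[0]
            hidden_states[mask] = soft_prompt.repeat(n_repeats, 1)
        return hidden_states


# Toy batch of 6 tokens: the first two positions belong to adapter slot 1.
layer = EmbeddingWithSoftPrompt(nn.Embedding(100, 8))
layer.set_prompt_adapter(1, torch.randn(2, 8))
layer.set_mapping(torch.tensor([1, 1, 0, 0, 0, 0]))
out = layer(torch.randint(0, 100, (6,)))
print(out.shape)  # torch.Size([6, 8])
```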
136 changes: 34 additions & 102 deletions vllm/prompt_adapter/models.py
@@ -1,15 +1,15 @@
import logging
import math
from typing import Callable, Dict, List, Optional, Tuple, Type, Union
from typing import Callable, Dict, List, Optional, Type

import torch
from peft.utils import load_peft_weights
from torch import nn

from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
AdapterModelManager)
from vllm.config import PromptAdapterConfig
from vllm.prompt_adapter.layers import PromptAdapterMapping
from vllm.prompt_adapter.layers import (
PromptAdapterMapping, VocabParallelEmbeddingWithPromptAdapter)

logger = logging.getLogger(__name__)

@@ -22,69 +22,6 @@ def get_prompt_adapter_id():
return _GLOBAL_PROMPT_ADAPTER_ID


def convert_mapping(
mapping: PromptAdapterMapping,
prompt_adapter_index_to_id: List[Optional[int]], max_prompt_adapters: int
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int]]:
"""Converts PromptAdapterMapping to index tensors.

Args:
mapping: PromptAdapterMapping mapping rows in a batch to ids.
prompt_adapter_index_to_id: List mapping PromptAdapter ids to indices.
max_prompt_adapters: Maximum number of PromptAdapters.
Returns:
A tuple of tensors:
base_indices: Tensor of shape [batch_size] mapping batch rows to
PromptAdapter indices.
sampler_indices: Tensor of shape [batch_size] mapping requests to
PromptAdapter indices for sampler. For generation, this will be
same as base_indicies. For prefill, this will map requests
to PromptAdapter indices.
sampler_indices_padded: Tensor of shape [batch_size] mapping
requests to PromptAdapter indices for sampler with padding.
Same as sampler_indicies, but -1 is replaced with
max_promt_adapters.
indices_len: List of lengths of the above tensors.
Used to index into each tensor. It contains length for
(base_indices, sampler_indices, sampler_indices_padded).
"""
index_mapping_indices: List[int] = list(mapping.index_mapping).copy()
prompt_adapter_indices = index_mapping_indices.copy()
prompt_mapping: List[int] = [
prompt_adapter_index_to_id.index(x) if x > 0 else -1
for x in mapping.prompt_mapping
]
prompt_adapter_idx = None
for i in range(len(index_mapping_indices)):
# TODO index can be slow. optimize
prompt_adapter_idx = (prompt_adapter_index_to_id.index(
index_mapping_indices[i]) if index_mapping_indices[i] > 0 else -1)
prompt_adapter_indices[i] = prompt_adapter_idx

indices_list: List[Union[List[int], torch.Tensor]] = [
index_mapping_indices, prompt_adapter_indices
]
indices = torch.tensor(indices_list, dtype=torch.long, device="cuda")
prompt_mapping_tensor = torch.tensor(prompt_mapping,
device="cuda",
dtype=torch.long)
base_indices = indices[1]
sampler_indices = prompt_mapping_tensor
sampler_indices_padded = sampler_indices.clone()
sampler_indices_padded[sampler_indices_padded ==
-1] = max_prompt_adapters - 1
sampler_indices_padded = (
torch.arange(
0, len(sampler_indices_padded), device="cuda", dtype=torch.long) +
(sampler_indices_padded * len(sampler_indices_padded)))
# Contain length of indices tensors. Used to index into each tensor.
indices_len = [
base_indices.shape[-1], sampler_indices.shape[-1],
sampler_indices_padded.shape[-1]
]
return (base_indices, sampler_indices, sampler_indices_padded, indices_len)


class PromptAdapterModel(AdapterModel):

def __init__(self,
@@ -133,16 +70,9 @@ def __init__(
self.model.prompt_adapter_manager = self
self.adapter_type = 'PromptAdapter'

self.base_indices = torch.empty(self.max_num_batched_tokens,
dtype=torch.long,
device="cuda")
self.sampler_indices = torch.empty(self.max_num_batched_tokens,
dtype=torch.long,
device="cuda")
self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens,
dtype=torch.long,
device="cuda")
self.indices_len: List[Optional[int]] = [None] * 3
self.base_indices = [0]
self.modules: Dict[str, nn.Module] = {}
self._create_prompt_adapter_modules()
self._last_mapping: Optional[PromptAdapterMapping] = None

@property
@@ -157,15 +87,6 @@ def adapter_slots(self) -> int:
def capacity(self) -> int:
return self.prompt_adapter_config.max_cpu_prompt_adapters

def reset_adapter(self):
try:
self.remove_all_prompt_adapters()
for module_name, module in self.model.named_modules():
if 'Model' in (module.__class__.__name__):
del module.prefix_encoder
except Exception:
pass

def activate_prompt_adapter(
self,
prompt_adapter_id: int,
@@ -187,10 +108,8 @@ def activate_prompt_adapter(
logger.debug("Activating prompt_adapter. int id: %d, slot index: %d",
prompt_adapter_model.id, index)
self.prompt_adapter_index_to_id[index] = prompt_adapter_model.id
for module_name, module in self.model.named_modules():
if 'Model' in (module.__class__.__name__):
module.prefix_encoder = prompt_adapter_model
break
for _, v in self.modules.items():
v.set_prompt_adapter(prompt_adapter_id, prompt_adapter_model)
return True

@property
Expand All @@ -201,9 +120,8 @@ def _deactivate_prompt_adapter(self, prompt_adapter_id: int):
try:
index = self.prompt_adapter_index_to_id.index(prompt_adapter_id)
self.prompt_adapter_index_to_id[index] = None
for module_name, module in self.model.named_modules():
if 'Model' in (module.__class__.__name__):
del module.prefix_encoder
for _, v in self.modules.items():
v.reset_prompt_adapter(prompt_adapter_id)
except ValueError:
pass

@@ -232,16 +150,30 @@ def remove_prompt_adapter(self):

def _set_prompt_adapter_mapping(self,
mapping: PromptAdapterMapping) -> None:
(base_indices, sampler_indices, sampler_indices_padded,
indices_len) = convert_mapping(mapping,
self.prompt_adapter_index_to_id,
self.prompt_adapter_slots + 1)
self.base_indices[:base_indices.shape[0]].copy_(base_indices)
self.sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
self.sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
sampler_indices_padded)
# Maintain the reference
self.indices_len[:] = indices_len
for k, v in self.modules.items():
v.set_mapping(mapping.index_mapping)

def _create_prompt_adapter_modules(self):
for module_name, module in self.model.named_modules(
remove_duplicate=False):
if "VocabParallel" in module.__class__.__name__:
new_module = VocabParallelEmbeddingWithPromptAdapter(module)
replaced_module = self.replace_submodule(
self.model, module_name, new_module)
self.register_module(module.__class__.__name__,
replaced_module)
replaced_module.set_mapping(self.base_indices)

def replace_submodule(self, model: nn.Module, module_name: str,
new_module: nn.Module) -> nn.Module:
"""Replace a submodule in a model with a new module."""
parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
target_name = module_name.split(".")[-1]
setattr(parent, target_name, new_module)
return new_module

def register_module(self, module_name: str, module: nn.Module):
self.modules[module_name] = module

@property
def set_prompt_adapter_mapping(self):
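
The replace_submodule helper above follows the common PyTorch pattern of re-registering a wrapped module under its original dotted name, so the rest of the model sees the wrapper transparently. A small self-contained sketch of that pattern; the toy model, wrapper class, and names are hypothetical and only illustrate the mechanism:

```python
import torch
from torch import nn


def replace_submodule(model: nn.Module, module_name: str,
                      new_module: nn.Module) -> nn.Module:
    """Replace a (possibly nested) submodule of `model` in place."""
    # get_submodule("") returns the model itself, so top-level names work too.
    parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
    target_name = module_name.split(".")[-1]
    setattr(parent, target_name, new_module)
    return new_module


class Wrapped(nn.Module):
    """Toy wrapper standing in for VocabParallelEmbeddingWithPromptAdapter."""

    def __init__(self, base_layer: nn.Module) -> None:
        super().__init__()
        self.base_layer = base_layer

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base_layer(x)


class ToyModel(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        self.embed_tokens = nn.Embedding(10, 4)


model = ToyModel()
# Walk the named modules, wrap the embedding layer, and re-register it under
# the same dotted name.
for name, module in list(model.named_modules(remove_duplicate=False)):
    if isinstance(module, nn.Embedding):
        replace_submodule(model, name, Wrapped(module))

print(model.embed_tokens)  # Wrapped( (base_layer): Embedding(10, 4) )
```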
3 changes: 0 additions & 3 deletions vllm/prompt_adapter/worker_manager.py
@@ -44,9 +44,6 @@ def __init__(
def is_enabled(self) -> bool:
return True

def reset_adapter(self):
self._prompt_adapter_manager.reset_adapter()

def create_prompt_adapter_manager(
self,
model: torch.nn.Module,