From f6fa2613390f5ec294609c674a1790f91d18fbda Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 30 Nov 2024 01:31:37 +0000 Subject: [PATCH 01/22] Init Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 105 ++++++-------------------------------------- vllm/lora/punica.py | 76 +++++++++++++++++++++++++++----- 2 files changed, 79 insertions(+), 102 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3701988ff692f..85a2402337131 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -67,63 +67,6 @@ def dec(*args, **kwargs): return dec -def apply_bias( - indices: torch.Tensor, - output: torch.Tensor, - bias_stacked: torch.Tensor, -): - """Applies bias to output - - Input shapes: - bias_stacked: (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) - bias_stacked = bias_stacked[indices] - bias_stacked[indices == -1] = 0 - output += bias_stacked - - return output.view_as(org_output) - - -def apply_bias_packed_nslice( - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], - bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], -): - """Applies bias to output - - Input shapes: - bias_stacked: 3 element tuple of (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - offset_left = 0 - for slice_idx, slice in enumerate(output_slices): - bias = bias_stacked[slice_idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[indices] - bias[indices == -1] = 0 - output[:, offset_left:offset_left + slice] += bias - - offset_left += slice - - return output.view_as(org_output) - - @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -401,13 +344,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): @@ -578,13 +518,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): @@ -774,15 +711,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - (self.output_dim, self.output_dim), - self.bias_stacked, - ) + self.punica_wrapper.add_lora_packed_nslice( - output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, - (self.output_dim, self.output_dim)) + output, x, self.lora_a_stacked, 
self.lora_b_stacked, + self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) return output @classmethod @@ -1131,15 +1063,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - self.output_slices, - self.bias_stacked, - ) self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0, + self.lora_b_stacked, + self.bias_stacked, 1.0, self.output_slices) return output @@ -1264,15 +1191,9 @@ def set_lora( def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 082041f390750..dcb69c231773a 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -450,6 +450,63 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) + def add_bias( + self, + indices: torch.Tensor, + output: torch.Tensor, + bias_stacked: torch.Tensor, + ): + """Applies bias to output + + Input shapes: + bias_stacked: (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, output_dim) + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) + bias_stacked = bias_stacked[indices] + bias_stacked[indices == -1] = 0 + output += bias_stacked + + return output.view_as(org_output) + + def add_bias_packed_nslice( + self, + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + ): + """Applies bias to output + + Input shapes: + bias_stacked: 3 element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, q_slice_size + 2*kv_slice_size) + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + + offset_left += slice + + return output.view_as(org_output) + def add_shrink( self, y: torch.Tensor, @@ -499,7 +556,6 @@ def add_expand_slice(self, """ Similar to `add_expand` """ - expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else self.expand_slice_decode) @@ -510,6 +566,7 @@ def add_lora(self, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], scale: float, y_offset: Optional[int] = None, y_slice_size: Optional[int] = None, @@ -544,7 +601,8 @@ def add_lora(self, buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - + if bias_all is not None: + x = self.add_bias(self.token_lora_indices, x, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if 
y_offset is None and y_slice_size is None: self.add_expand(y, buffer, wb_t_all, add_input=True) @@ -558,13 +616,10 @@ def add_lora(self, y = y.view_as(y_org) def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - scale: float, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_all: Tuple[Optional[torch.Tensor], + ...], scale: float, output_slices: Tuple[int, ...]) -> None: """ Applies lora to each input. Similar to add_lora, This method is @@ -577,8 +632,9 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): + bias = bias_all[slice_idx] if bias_all is not None else None self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], scale, offset_left, + lora_b_stacked[slice_idx], bias, scale, offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] From aff0182e0b614f0bc6e1ab479bacd85f505a8a02 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 07:20:25 +0000 Subject: [PATCH 02/22] Done 1/2 Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 11 ----------- vllm/lora/punica.py | 10 ++++++---- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 85a2402337131..5441b6c9336c3 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -342,9 +342,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, self.bias_stacked, 1.0) @@ -516,9 +513,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, self.bias_stacked, 1.0) @@ -709,9 +703,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - self.punica_wrapper.add_lora_packed_nslice( output, x, self.lora_a_stacked, self.lora_b_stacked, self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) @@ -1061,8 +1052,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, self.lora_b_stacked, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index dcb69c231773a..6414dd49be719 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -479,7 +479,7 @@ def add_bias_packed_nslice( indices: torch.Tensor, output: torch.Tensor, output_slices: Tuple[int, ...], - bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + bias_stacked: Tuple[Optional[torch.Tensor], ...], ): """Applies bias to output 
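[For readers following the bias refactor in patches 01-02 above: the `add_bias` helper moved into `PunicaWrapper` reduces to gathering one bias row per token via `token_lora_indices` and zeroing the rows of tokens that carry no LoRA (index -1). Below is a minimal dense sketch of that behavior; the function name, toy shapes, and out-of-place formulation are assumptions for illustration, not vLLM's actual kernel path.]

```python
import torch

def gather_lora_bias(indices: torch.Tensor, output: torch.Tensor,
                     bias_stacked: torch.Tensor) -> torch.Tensor:
    # bias_stacked: (num_loras, output_dim) -- one bias row per LoRA adapter
    # indices:      (batch_size,)           -- LoRA id per token, -1 means "no LoRA"
    # output:       (batch_size, output_dim)
    bias = bias_stacked[indices]   # gather the bias row selected for each token
    bias[indices == -1] = 0        # tokens without an adapter receive no bias
    return output + bias

# Toy usage with assumed sizes: 3 adapters, hidden dim 4, batch of 5 tokens.
bias_stacked = torch.randn(3, 4)
indices = torch.tensor([0, 2, -1, 1, 0])
out = gather_lora_bias(indices, torch.zeros(5, 4), bias_stacked)
```

[The helpers the patches above relocate into punica.py do this same gather in place on flattened views of `output`, and per slice in the packed n-slice case.]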
@@ -602,7 +602,7 @@ def add_lora(self, dtype=torch.float32, device=x.device) if bias_all is not None: - x = self.add_bias(self.token_lora_indices, x, bias_all) + y = self.add_bias(self.token_lora_indices, y, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: self.add_expand(y, buffer, wb_t_all, add_input=True) @@ -630,11 +630,13 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, x = x.view(-1, x.shape[-1]) y = y.view(-1, y.shape[-1]) offset_left = 0 + if bias_all is not None: + y = self.add_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_all) # TODO fuse these kernels for slice_idx in range(len(output_slices)): - bias = bias_all[slice_idx] if bias_all is not None else None self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], bias, scale, offset_left, + lora_b_stacked[slice_idx], None, scale, offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] From 20f8018d0b8ccf948269829fd8cd5421faed1084 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 10:10:17 +0000 Subject: [PATCH 03/22] Done Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 41 +++++++++------------------- vllm/lora/layers.py | 3 ++- vllm/lora/punica.py | 45 +++++++++++++++++++++++++++---- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index f5c2eced9d2bb..5f2d32defe030 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -73,6 +73,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, + self.bias_stacked, add_input=True) # now have column partitioned output @@ -131,27 +132,14 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): layer.lora_a_stacked[idx], 1.0) buffers = tensor_model_parallel_all_gather(buffers) - left_offset = 0 - for idx in range(n): - shard_size = layer.lora_b_stacked[idx].shape[2] - - if layer.bias_stacked is not None: - bias = layer.bias_stacked[idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[layer.punica_wrapper.token_lora_indices] - bias[layer.punica_wrapper.token_lora_indices == -1] = 0 - output[:, left_offset:left_offset + shard_size] += bias - - layer.punica_wrapper.add_expand_slice( - output, - buffers[idx], - layer.lora_b_stacked[idx], - left_offset, - shard_size, - add_input=True, - ) - left_offset += shard_size + layer.punica_wrapper.add_expand_packed_nslice( + output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + 1.0, + layer.output_slices, + ) output = output.view(*out_orig_shape) # now have column partitioned and packed output @@ -234,6 +222,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, + self.bias_all, add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) @@ -350,15 +339,9 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - - if self.bias_stacked is not None: - bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) - bias = bias[self.punica_wrapper.token_lora_indices] - bias[self.punica_wrapper.token_lora_indices == -1] = 0 - output += bias - self.punica_wrapper.add_expand_slice(output, buffer, - self.lora_b_stacked, start_idx, + self.lora_b_stacked, + self.bias_stacked, start_idx, shard_size) output 
= output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 5441b6c9336c3..73748b5ce511e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -254,6 +254,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, self.lora_b_stacked, + bias_all=None, add_input=True) return full_output.view_as(full_output_org) @@ -618,8 +619,8 @@ def create_lora_weights( ) for _ in range(n_slices)) else: self.bias_stacked = None - self.output_dim = self.lora_b_stacked[0].shape[2] + self.output_slices = (self.output_dim, self.output_dim) def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 6414dd49be719..8808ba94977cb 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -502,7 +502,6 @@ def add_bias_packed_nslice( bias = bias[indices] bias[indices == -1] = 0 output[:, offset_left:offset_left + slice] += bias - offset_left += slice return output.view_as(org_output) @@ -531,17 +530,20 @@ def add_expand( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], add_input: bool = True, ): """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the + Perform the ` y+=x@w_t_all+bias` computation, which is suitable for the GEMM of lora'b. When `is_prefill` is true, it indicates that it is currently the prefill stage, and the `expand_prefill` function should be called. Otherwise, it is the decode stage, and the expand_decode function should be called. """ - + if bias_all is not None: + y = self.add_bias(self.token_lora_indices, y, bias_all) + expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) @@ -550,17 +552,48 @@ def add_expand_slice(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True): """ Similar to `add_expand` """ + if bias_all is not None: + y = self.add_bias(self.token_lora_indices, y, bias_all) + expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else self.expand_slice_decode) expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_stacked: Optional[Tuple[torch.Tensor, + ...]], + scale: float, + output_slices: Tuple[int, ...]) -> None: + """ + Similar to `add_expand` + """ + y_org = y + y = y.view(-1, y.shape[-1]) + offset_left = 0 + if bias_stacked is not None: + self.add_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_stacked) + for slice_idx in range(len(lora_b_stacked)): + self.add_expand_slice(y, + x[slice_idx], + lora_b_stacked[slice_idx], + None, + offset_left, + output_slices[slice_idx], + add_input=True) + offset_left += output_slices[slice_idx] + + y = y.view_as(y_org) + def add_lora(self, y: torch.Tensor, x: torch.Tensor, @@ -579,12 +612,13 @@ def add_lora(self, @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) * scale - ).squeeze(0) + ).squeeze(0)+bias[i] Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor wa_t_all (torch.Tensor): lora_a's weight wb_t_all (torch.Tensor): lora_b's weight + bias_all: (torch.Tensor): lora's bias scale (float): Scaling factor. 
y_offset (Optional[int], optional): Offset to apply to the starting column of y. @@ -605,11 +639,12 @@ def add_lora(self, y = self.add_bias(self.token_lora_indices, y, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: - self.add_expand(y, buffer, wb_t_all, add_input=True) + self.add_expand(y, buffer, wb_t_all, bias_all=None, add_input=True) else: self.add_expand_slice(y, buffer, wb_t_all, + None, y_offset, y_slice_size, add_input=True) From 0a5aa735d3be3f1253c6636ac4bada8345581b88 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 10:34:17 +0000 Subject: [PATCH 04/22] Add lora bias test --- tests/lora/test_llama_tp.py | 60 +++++++++++++++++-------------------- vllm/lora/punica.py | 2 +- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index aae6310a2a213..0b4bcb6554cbb 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -55,15 +55,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@fork_new_process_for_each_test -def test_llama_lora(sql_lora_files): - - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1) - +def generate_and_test(llm,sql_lora_files): print("lora adapter created") assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT @@ -79,6 +71,18 @@ def test_llama_lora(sql_lora_files): print("removing lora") +@fork_new_process_for_each_test +def test_llama_lora(sql_lora_files): + + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1) + generate_and_test(llm,sql_lora_files) + + + @fork_new_process_for_each_test def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and @@ -118,20 +122,7 @@ def test_llama_lora_tp4(sql_lora_files): max_loras=4, tensor_parallel_size=4, ) - - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT - - print("removing lora") + generate_and_test(llm,sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -146,16 +137,21 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, ) - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + generate_and_test(llm,sql_lora_files) - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): - print("removing lora") + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + fully_sharded_loras=True, + enable_lora_bias=True, + ) + generate_and_test(llm,sql_lora_files) \ No newline at end of file diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 8808ba94977cb..9b05b044a815e 100644 --- 
a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -591,7 +591,7 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, output_slices[slice_idx], add_input=True) offset_left += output_slices[slice_idx] - + y = y.view_as(y_org) def add_lora(self, From 6805805b0c55f636daf397da692456ddde56d05a Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 15:38:18 +0000 Subject: [PATCH 05/22] Init --- tests/lora/test_llama_tp.py | 12 ++--- vllm/lora/fully_sharded_layers.py | 25 +++++---- vllm/lora/punica.py | 88 ++++++++++++++++--------------- 3 files changed, 62 insertions(+), 63 deletions(-) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 0b4bcb6554cbb..d3ca7f878191a 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -55,7 +55,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -def generate_and_test(llm,sql_lora_files): +def generate_and_test(llm, sql_lora_files): print("lora adapter created") assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT @@ -79,8 +79,7 @@ def test_llama_lora(sql_lora_files): max_num_seqs=16, max_loras=4, tensor_parallel_size=1) - generate_and_test(llm,sql_lora_files) - + generate_and_test(llm, sql_lora_files) @fork_new_process_for_each_test @@ -122,7 +121,7 @@ def test_llama_lora_tp4(sql_lora_files): max_loras=4, tensor_parallel_size=4, ) - generate_and_test(llm,sql_lora_files) + generate_and_test(llm, sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -137,8 +136,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, ) - generate_and_test(llm,sql_lora_files) - + generate_and_test(llm, sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -154,4 +152,4 @@ def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): fully_sharded_loras=True, enable_lora_bias=True, ) - generate_and_test(llm,sql_lora_files) \ No newline at end of file + generate_and_test(llm, sql_lora_files) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 5f2d32defe030..143319afa94bc 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -76,14 +76,6 @@ def apply(self, x: torch.Tensor, self.bias_stacked, add_input=True) # now have column partitioned output - - if self.bias_stacked is not None: - self.bias_stacked = self.bias_stacked.view( - -1, self.bias_stacked.shape[-1]) - self.bias_stacked = self.bias_stacked[ - self.punica_wrapper.token_lora_indices] - output += self.bias_stacked - output = output.view(*out_orig_shape) return output @@ -338,11 +330,18 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # the output is not the same as a normal row_parallel, it should be # reduced before being used shard_size = self.lora_b_stacked.shape[2] - start_idx = self.tp_rank * shard_size - self.punica_wrapper.add_expand_slice(output, buffer, - self.lora_b_stacked, - self.bias_stacked, start_idx, - shard_size) + + # To be compatible with the input of the add_expand_packed_nslice, + # there is only one slice. 
+ buffer = buffer.unsqueeze(dim=0) + self.punica_wrapper.add_expand_packed_nslice( + output, + buffer, + (self.lora_b_stacked, ), + (self.bias_stacked, ) if self.bias_stacked is not None else None, + 1.0, + (shard_size, ), + ) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 9b05b044a815e..6b071e88540ca 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -450,7 +450,28 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) - def add_bias( + def apply_expand_slice(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): + """ + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all+bias` + computation, which is suitable for the + GEMM of lora'b. + """ + if bias_all is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_all) + + expand_slice_fun: Callable = (self.expand_slice_prefill + if self.is_prefill else + self.expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + + def apply_bias( self, indices: torch.Tensor, output: torch.Tensor, @@ -474,7 +495,7 @@ def add_bias( return output.view_as(org_output) - def add_bias_packed_nslice( + def apply_bias_packed_nslice( self, indices: torch.Tensor, output: torch.Tensor, @@ -542,31 +563,12 @@ def add_expand( should be called. """ if bias_all is not None: - y = self.add_bias(self.token_lora_indices, y, bias_all) - + y = self.apply_bias(self.token_lora_indices, y, bias_all) + expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) - def add_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): - """ - Similar to `add_expand` - """ - if bias_all is not None: - y = self.add_bias(self.token_lora_indices, y, bias_all) - - expand_slice_fun: Callable = (self.expand_slice_prefill - if self.is_prefill else - self.expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) - def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, lora_b_stacked: Tuple[torch.Tensor, ...], bias_stacked: Optional[Tuple[torch.Tensor, @@ -580,18 +582,18 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, y = y.view(-1, y.shape[-1]) offset_left = 0 if bias_stacked is not None: - self.add_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_stacked) + self.apply_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_stacked) for slice_idx in range(len(lora_b_stacked)): - self.add_expand_slice(y, - x[slice_idx], - lora_b_stacked[slice_idx], - None, - offset_left, - output_slices[slice_idx], - add_input=True) + self.apply_expand_slice(y, + x[slice_idx], + lora_b_stacked[slice_idx], + None, + offset_left, + output_slices[slice_idx], + add_input=True) offset_left += output_slices[slice_idx] - + y = y.view_as(y_org) def add_lora(self, @@ -636,18 +638,18 @@ def add_lora(self, dtype=torch.float32, device=x.device) if bias_all is not None: - y = self.add_bias(self.token_lora_indices, y, bias_all) + y = self.apply_bias(self.token_lora_indices, y, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: self.add_expand(y, 
buffer, wb_t_all, bias_all=None, add_input=True) else: - self.add_expand_slice(y, - buffer, - wb_t_all, - None, - y_offset, - y_slice_size, - add_input=True) + self.apply_expand_slice(y, + buffer, + wb_t_all, + None, + y_offset, + y_slice_size, + add_input=True) y = y.view_as(y_org) def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, @@ -666,8 +668,8 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, y = y.view(-1, y.shape[-1]) offset_left = 0 if bias_all is not None: - y = self.add_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_all) + y = self.apply_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_all) # TODO fuse these kernels for slice_idx in range(len(output_slices)): self.add_lora(y, x, lora_a_stacked[slice_idx], From c5c4598e3fec1c717bad3660e62a3afe9b9970bd Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Dec 2024 15:20:36 +0000 Subject: [PATCH 06/22] Modify layers backup Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 864 ++++++++++++++++++++++++++------------------ vllm/lora/models.py | 8 +- vllm/lora/punica.py | 5 +- 3 files changed, 513 insertions(+), 364 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 73748b5ce511e..f46a9470f61b7 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,7 +1,7 @@ # pylint: disable=unused-argument import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -19,6 +19,7 @@ from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearBase, MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -135,6 +136,7 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.base_layer = base_layer self.embeddings_slice: Optional[Tuple[int, int]] self.embeddings_weights: Optional[torch.Tensor] + self.n_slices = 1 def create_lora_weights( self, @@ -168,34 +170,36 @@ def create_lora_weights( dtype=self.base_layer.weight.dtype, device=self.base_layer.weight.device, ) - self.lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.org_vocab_size + - lora_config.lora_extra_vocab_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - self.base_layer.embedding_dim, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_a_stacked_2d = self.lora_a_stacked.view( - self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], - self.lora_a_stacked.shape[2], + self.lora_a_stacked = tuple( + torch.zeros( + ( + max_loras, + self.base_layer.org_vocab_size + + lora_config.lora_extra_vocab_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + ( + max_loras, + 1, + self.base_layer.embedding_dim, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) for _ in range(self.n_slices)) + self.lora_a_stacked_2d = self.lora_a_stacked[0].view( + self.lora_a_stacked[0].shape[0] * self.lora_a_stacked[0].shape[1], + self.lora_a_stacked[0].shape[2], ) def reset_lora(self, index: int): - 
self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 + self.lora_a_stacked[0][index] = 0 + self.lora_b_stacked[0][index] = 0 self.embeddings_tensors[index] = 0 def set_lora( @@ -207,11 +211,12 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( - lora_a, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + self.lora_a_stacked[0][ + index, :lora_a.shape[0], :lora_a.shape[1]].copy_(lora_a, + non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -253,7 +258,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, - self.lora_b_stacked, + self.lora_b_stacked[0], bias_all=None, add_input=True) return full_output.view_as(full_output_org) @@ -269,14 +274,15 @@ def can_replace_layer( return type(source_layer) is VocabParallelEmbedding -class ReplicatedLinearWithLoRA(BaseLayerWithLoRA): +class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): - def __init__(self, base_layer: ReplicatedLinear) -> None: + def __init__(self, base_layer: LinearBase): super().__init__() self.base_layer = base_layer self.input_size = self.base_layer.input_size - self.output_size = self.base_layer.output_size self.device = _get_lora_device(self.base_layer) + self.output_slices: Tuple[int, ...] + self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None def create_lora_weights( self, @@ -285,39 +291,46 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - lora_a_output_size = lora_config.max_lora_rank - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( + lora_a_output_size_per_partition = ( + lora_config.max_lora_rank if not lora_config.fully_sharded_loras + else divide(lora_config.max_lora_rank, self.tp_size)) + self.lora_a_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_a_output_size_per_partition, + self.input_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( max_loras, 1, self.output_size, + lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, - ) - else: - self.bias_stacked = None + ) for _ in range(self.n_slices)) + if lora_config.bias_enabled: + self.bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + for s_index in range(self.n_slices): + self.lora_a_stacked[s_index][index] = 0 + self.lora_b_stacked[s_index][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked = 
cast(Tuple[torch.Tensor, ...], + self.bias_stacked) + self.bias_stacked[s_index][index] = 0 def set_lora( self, @@ -329,25 +342,126 @@ def set_lora( ): self.reset_lora(index) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + if self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) + + self.lora_a_stacked[0][index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) + self.bias_stacked[0][index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) + self.punica_wrapper.add_lora_packed_nslice(output, x, + self.lora_a_stacked, + self.lora_b_stacked, + self.bias_stacked, 1.0, + self.output_slices) return output + +class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): + + def __init__(self, base_layer: ReplicatedLinear) -> None: + super().__init__(base_layer, ) + self.tp_size = 1 #To ensure interface compatibility, it is set to 1 + self.output_size = self.base_layer.output_size + self.n_slices = 1 + + # def create_lora_weights( + # self, + # max_loras: int, + # lora_config: LoRAConfig, + # model_config: Optional[PretrainedConfig] = None, + # ) -> None: + # self.lora_config = lora_config + # lora_a_output_size_per_partition = ( + # lora_config.max_lora_rank if not lora_config.fully_sharded_loras + # else divide(lora_config.max_lora_rank, self.tp_size)) + # self.lora_a_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # lora_a_output_size_per_partition, + # self.input_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.lora_b_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # lora_config.max_lora_rank, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # if lora_config.bias_enabled: + # self.bias_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.output_slices = (self.lora_b_stacked[0].shape[2], ) + + # def reset_lora(self, index: int): + # for s_index in range(self.n_slices): + # self.lora_a_stacked[s_index][index] = 0 + # self.lora_b_stacked[s_index][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[s_index][index] = 0 + + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], 
:lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) + + # def apply(self, x: torch.Tensor, + # bias: Optional[torch.Tensor]) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output + def forward(self, input_): """Forward of ReplicatedLinearWithLoRA @@ -380,7 +494,7 @@ def can_replace_layer( return type(source_layer) is ReplicatedLinear -class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): +class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. @@ -388,65 +502,68 @@ class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): """ def __init__(self, base_layer: ColumnParallelLinear) -> None: - super().__init__() + super().__init__(base_layer) # The base_layer type is ColumnParallelLinear or # MergedColumnParallelLinear, their weight sharding logic is # inconsistent when TP is greater than 1. self.is_merged_col_linear = type( base_layer) is MergedColumnParallelLinear - - self.base_layer = base_layer self.tp_size = get_tensor_model_parallel_world_size() - self.input_size = self.base_layer.input_size self.output_size = self.base_layer.output_size_per_partition - self.device = _get_lora_device(self.base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config - self.tp_size = get_tensor_model_parallel_world_size() - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - - self.output_dim = self.lora_b_stacked.shape[2] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + self.n_slices = 1 + # self.output_slices: Tuple[int, ...] 
+ # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + # self.n_slices = 1 + + # def create_lora_weights( + # self, + # max_loras: int, + # lora_config: LoRAConfig, + # model_config: Optional[PretrainedConfig] = None, + # ) -> None: + # self.lora_config = lora_config + # lora_a_output_size_per_partition = ( + # lora_config.max_lora_rank if not lora_config.fully_sharded_loras + # else divide(lora_config.max_lora_rank, self.tp_size)) + # self.lora_a_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # lora_a_output_size_per_partition, + # self.input_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.lora_b_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # lora_config.max_lora_rank, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + + # if lora_config.bias_enabled: + # self.bias_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.output_dim = self.lora_b_stacked[0].shape[2] + # self.output_slices = (self.output_dim, ) + + # def reset_lora(self, index: int): + # for s_index in range(self.n_slices): + # self.lora_a_stacked[s_index][index] = 0 + # self.lora_b_stacked[s_index][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[s_index][index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -485,39 +602,44 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = bias[start_idx:end_idx] return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) - return output + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + + # if self.tp_size > 1: + # lora_a = self.slice_lora_a(lora_a) + # lora_b = self.slice_lora_b(lora_b) + # if bias is not None: + # bias = self.slice_bias(bias) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) + + # def apply(self, x: torch.Tensor, + # bias: 
Optional[torch.Tensor]) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output def forward(self, input_): """Forward of ColumnParallelLinear @@ -568,6 +690,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) + self.n_slices = 2 def create_lora_weights( self, @@ -576,8 +699,8 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - n_slices = 2 - if not (len(self.base_layer.output_sizes) == n_slices + + if not (len(self.base_layer.output_sizes) == self.n_slices and self.base_layer.output_sizes[0] == self.base_layer.output_sizes[1]): raise ValueError( @@ -598,7 +721,7 @@ def create_lora_weights( self.input_size, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for _ in range(self.n_slices)) self.lora_b_stacked = tuple( torch.zeros( max_loras, @@ -607,7 +730,7 @@ def create_lora_weights( lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for _ in range(self.n_slices)) if lora_config.bias_enabled: self.bias_stacked = tuple( torch.zeros( @@ -616,20 +739,18 @@ def create_lora_weights( self.output_size // 2, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) - else: - self.bias_stacked = None + ) for _ in range(self.n_slices)) self.output_dim = self.lora_b_stacked[0].shape[2] self.output_slices = (self.output_dim, self.output_dim) - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_b_stacked[1][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 + # def reset_lora(self, index: int): + # self.lora_a_stacked[0][index] = 0 + # self.lora_a_stacked[1][index] = 0 + # self.lora_b_stacked[0][index] = 0 + # self.lora_b_stacked[1][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked[0][index] = 0 + # self.bias_stacked[1][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -686,6 +807,8 @@ def set_lora( index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) if bias is not None and bias[0] is not None: + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_(bias[0].T, non_blocking=True) @@ -697,17 +820,19 @@ def set_lora( index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) if bias is not None and bias[1] is not None: + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_(bias[1].T, non_blocking=True) - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice( - output, x, self.lora_a_stacked, self.lora_b_stacked, - self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) - return output + # def apply(self, x: torch.Tensor, + # bias: Optional[torch.Tensor]) -> torch.Tensor: + # output = 
self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice( + # output, x, self.lora_a_stacked, self.lora_b_stacked, + # self.bias_stacked, 1.0, self.output_slices) + # return output @classmethod @_not_fully_sharded_can_replace @@ -746,6 +871,7 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.base_layer.head_size) self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + self.n_slices = 1 def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() @@ -780,31 +906,33 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + # if self.tp_size > 1: + # lora_a = self.slice_lora_a(lora_a) + # lora_b = self.slice_lora_b(lora_b) + # if bias is not None: + # bias = self.slice_bias(bias) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -828,6 +956,7 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) + self.n_slices = 3 def create_lora_weights( self, @@ -925,9 +1054,6 @@ def create_lora_weights( device=self.device, ), ) - else: - self.bias_stacked = None - self.output_slices = ( self.q_proj_shard_size, self.kv_proj_shard_size, @@ -939,17 +1065,17 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[1][index] = 0 - self.lora_a_stacked[2][index] = 0 - self.lora_b_stacked[2][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 - self.bias_stacked[2][index] = 0 + # def reset_lora(self, index: int): + # self.lora_a_stacked[0][index] = 0 + # self.lora_b_stacked[0][index] = 0 + # self.lora_a_stacked[1][index] = 0 + # self.lora_b_stacked[1][index] = 0 + # self.lora_a_stacked[2][index] = 0 + # self.lora_b_stacked[2][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked[0][index] = 0 + # self.bias_stacked[1][index] = 0 + # 
self.bias_stacked[2][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -1040,6 +1166,8 @@ def set_lora( lora_a[2].T, non_blocking=True) if bias is not None: + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) if bias[0] is not None: self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( bias[0].T, non_blocking=True) @@ -1050,15 +1178,15 @@ def set_lora( self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( bias[2].T, non_blocking=True) - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice(output, x, - self.lora_a_stacked, - self.lora_b_stacked, - self.bias_stacked, 1.0, - self.output_slices) - return output + # def apply(self, x: torch.Tensor, + # bias: Optional[torch.Tensor]) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output @classmethod @_not_fully_sharded_can_replace @@ -1073,70 +1201,83 @@ def can_replace_layer( and len(packed_modules_list) == 3) -class RowParallelLinearWithLoRA(BaseLayerWithLoRA): +class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): + # def __init__(self,base_layer,n_slices): + # self.base_layer = base_layer + # self.input_size = self.base_layer.input_size + # self.device = _get_lora_device(self.base_layer) + # self.output_slices: Tuple[int, ...] + # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + # self.n_slices = n_slices def __init__(self, base_layer: RowParallelLinear) -> None: - super().__init__() - self.base_layer = base_layer + super().__init__(base_layer) + + self.tp_size = get_tensor_model_parallel_world_size() self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size - self.device = _get_lora_device(self.base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config - self.tp_rank = get_tensor_model_parallel_rank() - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.input_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - tp_size = get_tensor_model_parallel_world_size() - lora_b_output_size_per_partition = ( - self.output_size if not lora_config.fully_sharded_loras else - divide(self.output_size, tp_size)) - - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - lora_b_output_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( - ( - max_loras, - 1, - self.output_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - # Lazily initialized - self.indices: torch.Tensor - self.indices_len: List[int] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + self.n_slices = 1 + + # def create_lora_weights( + # self, + # max_loras: int, + # lora_config: LoRAConfig, + # model_config: Optional[PretrainedConfig] = None, + # ) -> None: + # self.lora_config = lora_config + # self.tp_rank = 
get_tensor_model_parallel_rank() + # self.lora_a_stacked = tuple( + # torch.zeros( + # ( + # max_loras, + # 1, + # lora_config.max_lora_rank, + # self.input_size, + # ), + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # tp_size = get_tensor_model_parallel_world_size() + # lora_b_output_size_per_partition = ( + # self.output_size if not lora_config.fully_sharded_loras else + # divide(self.output_size, tp_size)) + + # self.lora_b_stacked = tuple( + # torch.zeros( + # ( + # max_loras, + # 1, + # lora_b_output_size_per_partition, + # lora_config.max_lora_rank, + # ), + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + + # if lora_config.bias_enabled: + # self.bias_stacked = tuple( + # torch.zeros( + # ( + # max_loras, + # 1, + # self.output_size, + # ), + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # # Lazily initialized + # self.output_slices = (self.lora_b_stacked[0].shape[2], ) + # self.indices: torch.Tensor + # self.indices_len: List[int] + + # def reset_lora(self, index: int): + # for s_index in range(self.n_slices): + # self.lora_a_stacked[s_index][index] = 0 + # self.lora_b_stacked[s_index][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[s_index][index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() @@ -1152,39 +1293,43 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.base_layer.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) - return output + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + + # if self.base_layer.tp_size > 1: + # lora_a = self.slice_lora_a(lora_a) + # lora_b = self.slice_lora_b(lora_b) + # if bias is not None: + # bias = self.slice_bias(bias) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) + + # def apply(self, x: torch.Tensor) -> torch.Tensor: + # output = 
self.base_layer.quant_method.apply(self.base_layer, x) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output def forward(self, input_): """Forward of RowParallelLinear @@ -1267,6 +1412,7 @@ def __init__(self, base_layer: LogitsProcessor, hidden_size: int, self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() self.sharded_to_full_mapping = sharded_to_full_mapping + self.n_slices = 1 @property def logits_as_input(self): @@ -1310,29 +1456,32 @@ def create_lora_weights( if 32000 < self.base_layer.vocab_size > 257024: raise ValueError("When using LoRA, vocab size must be " "32000 >= vocab_size <= 257024") - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - # Pad for kernel compatibility - math.ceil(self.base_layer.vocab_size / - lora_config.lora_vocab_padding_size) * - lora_config.lora_vocab_padding_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) + + self.lora_a_stacked = tuple( + torch.zeros( + ( + max_loras, + 1, + lora_config.max_lora_rank, + self.hidden_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + ( + max_loras, + 1, + # Pad for kernel compatibility + math.ceil(self.base_layer.vocab_size / + lora_config.lora_vocab_padding_size) * + lora_config.lora_vocab_padding_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) self.embeddings_tensors = torch.full( (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), fill_value=float("-inf"), @@ -1346,10 +1495,11 @@ def create_lora_weights( dtype=torch.long) else: self.sharded_to_full_mapping_gpu = None + self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 + self.lora_a_stacked[0][index] = 0 + self.lora_b_stacked[0][index] = 0 self.embeddings_tensors[index] = float("-inf") def set_lora( @@ -1361,12 +1511,12 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + self.lora_a_stacked[0][index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -1430,8 +1580,8 @@ def _get_logits( # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, - self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_a_stacked[0], + self.lora_b_stacked[0], 1.0) # Remove paddings in vocab (if any). 
logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 2ffefe61427e3..9855b57d0c9c9 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -555,17 +555,17 @@ def create_dummy_lora( input_dim, output_dim, rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", embeddings_tensor_dim=embeddings_tensor_dim, bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, - module.lora_a_stacked.shape[-1], - module.lora_b_stacked.shape[-2], + module.lora_a_stacked[0].shape[-1], + module.lora_b_stacked[0].shape[-2], rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", bias_enabled=bias_enabled, ) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 4ebc92a949e84..f2fed6a485f64 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -596,7 +596,6 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, y = y.view_as(y_org) - def add_lora(self, y: torch.Tensor, x: torch.Tensor, @@ -656,8 +655,8 @@ def add_lora(self, def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], lora_b_stacked: Tuple[torch.Tensor, ...], - bias_all: Tuple[Optional[torch.Tensor], - ...], scale: float, + bias_all: Optional[Tuple[torch.Tensor, + ...]], scale: float, output_slices: Tuple[int, ...]) -> None: """ Applies lora to each input. Similar to add_lora, This method is From 0225059af2c176701d22642ff092d3af51255df4 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 01:14:02 +0000 Subject: [PATCH 07/22] Modify layers.py Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 350 +------------------------------------------- 1 file changed, 2 insertions(+), 348 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index f46a9470f61b7..0a25b7e97f8f0 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -381,87 +381,6 @@ def __init__(self, base_layer: ReplicatedLinear) -> None: self.output_size = self.base_layer.output_size self.n_slices = 1 - # def create_lora_weights( - # self, - # max_loras: int, - # lora_config: LoRAConfig, - # model_config: Optional[PretrainedConfig] = None, - # ) -> None: - # self.lora_config = lora_config - # lora_a_output_size_per_partition = ( - # lora_config.max_lora_rank if not lora_config.fully_sharded_loras - # else divide(lora_config.max_lora_rank, self.tp_size)) - # self.lora_a_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # lora_a_output_size_per_partition, - # self.input_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.lora_b_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # lora_config.max_lora_rank, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # if lora_config.bias_enabled: - # self.bias_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.output_slices = (self.lora_b_stacked[0].shape[2], ) - - # def reset_lora(self, index: int): - # for s_index in range(self.n_slices): - # self.lora_a_stacked[s_index][index] = 0 - # self.lora_b_stacked[s_index][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[s_index][index] = 0 - - # def set_lora( - # self, - # index: int, - # 
lora_a: torch.Tensor, - # lora_b: torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - def forward(self, input_): """Forward of ReplicatedLinearWithLoRA @@ -511,59 +430,6 @@ def __init__(self, base_layer: ColumnParallelLinear) -> None: self.tp_size = get_tensor_model_parallel_world_size() self.output_size = self.base_layer.output_size_per_partition self.n_slices = 1 - # self.output_slices: Tuple[int, ...] - # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None - # self.n_slices = 1 - - # def create_lora_weights( - # self, - # max_loras: int, - # lora_config: LoRAConfig, - # model_config: Optional[PretrainedConfig] = None, - # ) -> None: - # self.lora_config = lora_config - # lora_a_output_size_per_partition = ( - # lora_config.max_lora_rank if not lora_config.fully_sharded_loras - # else divide(lora_config.max_lora_rank, self.tp_size)) - # self.lora_a_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # lora_a_output_size_per_partition, - # self.input_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.lora_b_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # lora_config.max_lora_rank, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - - # if lora_config.bias_enabled: - # self.bias_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.output_dim = self.lora_b_stacked[0].shape[2] - # self.output_slices = (self.output_dim, ) - - # def reset_lora(self, index: int): - # for s_index in range(self.n_slices): - # self.lora_a_stacked[s_index][index] = 0 - # self.lora_b_stacked[s_index][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[s_index][index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -602,45 +468,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = bias[start_idx:end_idx] return bias - # def set_lora( - # self, - # index: int, - # lora_a: torch.Tensor, - # lora_b: torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - - # if self.tp_size > 1: - # lora_a = self.slice_lora_a(lora_a) - # lora_b = self.slice_lora_b(lora_b) - # if bias is not None: - # bias = self.slice_bias(bias) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, 
:lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - def forward(self, input_): """Forward of ColumnParallelLinear @@ -743,15 +570,6 @@ def create_lora_weights( self.output_dim = self.lora_b_stacked[0].shape[2] self.output_slices = (self.output_dim, self.output_dim) - # def reset_lora(self, index: int): - # self.lora_a_stacked[0][index] = 0 - # self.lora_a_stacked[1][index] = 0 - # self.lora_b_stacked[0][index] = 0 - # self.lora_b_stacked[1][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked[0][index] = 0 - # self.bias_stacked[1][index] = 0 - def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: @@ -826,14 +644,6 @@ def set_lora( 0, :bias[1].shape[0]].copy_(bias[1].T, non_blocking=True) - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice( - # output, x, self.lora_a_stacked, self.lora_b_stacked, - # self.bias_stacked, 1.0, self.output_slices) - # return output - @classmethod @_not_fully_sharded_can_replace def can_replace_layer( @@ -862,7 +672,6 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) - self.tp_size = get_tensor_model_parallel_world_size() self.q_proj_total_size = (self.base_layer.total_num_heads * self.base_layer.head_size) self.q_proj_shard_size = (self.base_layer.num_heads * @@ -905,35 +714,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: self.kv_proj_shard_size * (self.kv_shard_id + 1)] bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - - # def set_lora( - # self, - # index: int, - # lora_a: torch.Tensor, - # lora_b: torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - # if self.tp_size > 1: - # lora_a = self.slice_lora_a(lora_a) - # lora_b = self.slice_lora_b(lora_b) - # if bias is not None: - # bias = self.slice_bias(bias) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - + @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, @@ -1065,18 +846,6 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] - # def reset_lora(self, index: int): - # self.lora_a_stacked[0][index] = 0 - # self.lora_b_stacked[0][index] = 0 - # self.lora_a_stacked[1][index] = 0 - # self.lora_b_stacked[1][index] = 0 - # self.lora_a_stacked[2][index] = 0 - # 
self.lora_b_stacked[2][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked[0][index] = 0 - # self.bias_stacked[1][index] = 0 - # self.bias_stacked[2][index] = 0 - def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: @@ -1178,16 +947,6 @@ def set_lora( self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( bias[2].T, non_blocking=True) - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - @classmethod @_not_fully_sharded_can_replace def can_replace_layer( @@ -1203,82 +962,15 @@ def can_replace_layer( class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): - # def __init__(self,base_layer,n_slices): - # self.base_layer = base_layer - # self.input_size = self.base_layer.input_size - # self.device = _get_lora_device(self.base_layer) - # self.output_slices: Tuple[int, ...] - # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None - # self.n_slices = n_slices def __init__(self, base_layer: RowParallelLinear) -> None: super().__init__(base_layer) self.tp_size = get_tensor_model_parallel_world_size() + # reset input_size self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size self.n_slices = 1 - # def create_lora_weights( - # self, - # max_loras: int, - # lora_config: LoRAConfig, - # model_config: Optional[PretrainedConfig] = None, - # ) -> None: - # self.lora_config = lora_config - # self.tp_rank = get_tensor_model_parallel_rank() - # self.lora_a_stacked = tuple( - # torch.zeros( - # ( - # max_loras, - # 1, - # lora_config.max_lora_rank, - # self.input_size, - # ), - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # tp_size = get_tensor_model_parallel_world_size() - # lora_b_output_size_per_partition = ( - # self.output_size if not lora_config.fully_sharded_loras else - # divide(self.output_size, tp_size)) - - # self.lora_b_stacked = tuple( - # torch.zeros( - # ( - # max_loras, - # 1, - # lora_b_output_size_per_partition, - # lora_config.max_lora_rank, - # ), - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - - # if lora_config.bias_enabled: - # self.bias_stacked = tuple( - # torch.zeros( - # ( - # max_loras, - # 1, - # self.output_size, - # ), - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # # Lazily initialized - # self.output_slices = (self.lora_b_stacked[0].shape[2], ) - # self.indices: torch.Tensor - # self.indices_len: List[int] - - # def reset_lora(self, index: int): - # for s_index in range(self.n_slices): - # self.lora_a_stacked[s_index][index] = 0 - # self.lora_b_stacked[s_index][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[s_index][index] = 0 - def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() shard_size = self.input_size @@ -1293,44 +985,6 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - # def set_lora( - # self, - # index: int, - # lora_a: torch.Tensor, - # lora_b: 
torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - - # if self.base_layer.tp_size > 1: - # lora_a = self.slice_lora_a(lora_a) - # lora_b = self.slice_lora_b(lora_b) - # if bias is not None: - # bias = self.slice_bias(bias) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - - # def apply(self, x: torch.Tensor) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - def forward(self, input_): """Forward of RowParallelLinear From 88366e8e4117d0a23ed343255d6ee8e205384e18 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 01:27:01 +0000 Subject: [PATCH 08/22] Revert embedding and logits layer Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 139 +++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 74 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 0a25b7e97f8f0..e24a321680fdb 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -136,7 +136,6 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.base_layer = base_layer self.embeddings_slice: Optional[Tuple[int, int]] self.embeddings_weights: Optional[torch.Tensor] - self.n_slices = 1 def create_lora_weights( self, @@ -170,36 +169,34 @@ def create_lora_weights( dtype=self.base_layer.weight.dtype, device=self.base_layer.weight.device, ) - self.lora_a_stacked = tuple( - torch.zeros( - ( - max_loras, - self.base_layer.org_vocab_size + - lora_config.lora_extra_vocab_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - torch.zeros( - ( - max_loras, - 1, - self.base_layer.embedding_dim, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) for _ in range(self.n_slices)) - self.lora_a_stacked_2d = self.lora_a_stacked[0].view( - self.lora_a_stacked[0].shape[0] * self.lora_a_stacked[0].shape[1], - self.lora_a_stacked[0].shape[2], + self.lora_a_stacked = torch.zeros( + ( + max_loras, + self.base_layer.org_vocab_size + + lora_config.lora_extra_vocab_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + self.base_layer.embedding_dim, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_a_stacked_2d = self.lora_a_stacked.view( + self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], + self.lora_a_stacked.shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 self.embeddings_tensors[index] = 0 def set_lora( @@ -211,12 +208,11 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): 
self.reset_lora(index) - self.lora_a_stacked[0][ - index, :lora_a.shape[0], :lora_a.shape[1]].copy_(lora_a, - non_blocking=True) - self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( + lora_a, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -258,7 +254,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, - self.lora_b_stacked[0], + self.lora_b_stacked, bias_all=None, add_input=True) return full_output.view_as(full_output_org) @@ -714,7 +710,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: self.kv_proj_shard_size * (self.kv_shard_id + 1)] bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - + @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, @@ -1066,7 +1062,6 @@ def __init__(self, base_layer: LogitsProcessor, hidden_size: int, self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() self.sharded_to_full_mapping = sharded_to_full_mapping - self.n_slices = 1 @property def logits_as_input(self): @@ -1110,32 +1105,29 @@ def create_lora_weights( if 32000 < self.base_layer.vocab_size > 257024: raise ValueError("When using LoRA, vocab size must be " "32000 >= vocab_size <= 257024") - - self.lora_a_stacked = tuple( - torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - torch.zeros( - ( - max_loras, - 1, - # Pad for kernel compatibility - math.ceil(self.base_layer.vocab_size / - lora_config.lora_vocab_padding_size) * - lora_config.lora_vocab_padding_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) + self.lora_a_stacked = torch.zeros( + ( + max_loras, + 1, + lora_config.max_lora_rank, + self.hidden_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + # Pad for kernel compatibility + math.ceil(self.base_layer.vocab_size / + lora_config.lora_vocab_padding_size) * + lora_config.lora_vocab_padding_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) self.embeddings_tensors = torch.full( (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), fill_value=float("-inf"), @@ -1149,11 +1141,10 @@ def create_lora_weights( dtype=torch.long) else: self.sharded_to_full_mapping_gpu = None - self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 self.embeddings_tensors[index] = float("-inf") def set_lora( @@ -1165,12 +1156,12 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - self.lora_a_stacked[0][index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + 
self.lora_a_stacked[index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -1234,8 +1225,8 @@ def _get_logits( # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, - self.lora_a_stacked[0], - self.lora_b_stacked[0], 1.0) + self.lora_a_stacked, + self.lora_b_stacked, 1.0) # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] From b446a3c0a8dab537ca6d92af1c2b3dbb78c5f573 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 08:35:22 +0000 Subject: [PATCH 09/22] Modify layers.py and fs_layers.py Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 126 +++++++++++++----------- vllm/lora/layers.py | 69 ++++++++----- vllm/lora/punica.py | 156 +++++++++++++++++++++++------- 3 files changed, 233 insertions(+), 118 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 34d4ffbdb1778..c7a13f83f48eb 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,5 +1,5 @@ # pylint: disable=unused-argument -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -51,30 +51,35 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): # gather operation. def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + # TODO add doc buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) + self.punica_wrapper.add_shrink_packed_nslice(buffer, x, + self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - add_input=True) + self.punica_wrapper.add_expand_packed_nslice(output, + buffer, + self.lora_b_stacked, + self.bias_stacked, + self.output_slices, + add_input=True) + # now have column partitioned output output = output.view(*out_orig_shape) return output @@ -109,29 +114,25 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): MergedColumnParallelLinearWithShardedLoRA. 
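    A minimal, single-process sketch of the math this function implements may
    help; it is illustrative only (shapes and names are invented, not taken
    from this patch), and the tensor-parallel all_gather is emulated with a
    plain concatenation:

        import torch

        tokens, hidden, rank, out_dim, tp = 4, 16, 8, 32, 2
        x = torch.randn(tokens, hidden)
        lora_a = torch.randn(rank, hidden)        # full LoRA A
        lora_b = torch.randn(out_dim, rank)       # full LoRA B

        ref = x @ lora_a.T @ lora_b.T             # unsharded LoRA delta

        # Fully sharded: A is split along the rank dim, B along the output dim.
        a_shards = lora_a.chunk(tp, dim=0)        # (rank // tp, hidden) per rank
        b_shards = lora_b.chunk(tp, dim=0)        # (out_dim // tp, rank) per rank

        partial = [x @ a.T for a in a_shards]     # local shrink on each rank
        buffer = torch.cat(partial, dim=1)        # stands in for all_gather
        cols = [buffer @ b.T for b in b_shards]   # local expand on each rank
        assert torch.allclose(torch.cat(cols, dim=1), ref, rtol=1e-4, atol=1e-4)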
""" # expecting 2 for column parallel and 3 for qkv - n = len(layer.lora_a_stacked) + assert len(layer.lora_a_stacked) == layer.n_slices output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape buffers = torch.zeros( - (n, x.shape[0], layer.lora_a_stacked[0].shape[2]), + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) - for idx in range(n): - layer.punica_wrapper.add_shrink(buffers[idx], x, - layer.lora_a_stacked[idx], 1.0) - + layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, + layer.lora_a_stacked, 1.0) buffers = tensor_model_parallel_all_gather(buffers) - layer.punica_wrapper.add_expand_packed_nslice( - output, - buffers, - layer.lora_b_stacked, - layer.bias_stacked, - 1.0, - layer.output_slices, - ) + layer.punica_wrapper.add_expand_packed_nslice(output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + layer.output_slices, + add_input=True) output = output.view(*out_orig_shape) # now have column partitioned and packed output @@ -161,8 +162,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -194,28 +196,33 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora): def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) + buffer = torch.zeros( + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + self.punica_wrapper.add_shrink_packed_nslice(buffer, x, + self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - add_input=True) + self.punica_wrapper.add_expand_packed_nslice(output, + buffer, + self.lora_b_stacked, + self.bias_stacked, + self.output_slices, + add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) return output @@ -259,8 +266,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -293,7 +301,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): """ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - shard_size = self.lora_b_stacked.shape[2] + shard_size = self.lora_b_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = 
lora_b[:, start_idx:end_idx] @@ -302,25 +310,29 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - shard_size = self.bias_stacked.shape[2] + self.bias_stacked = cast(Tuple[torch.Tensor, ...], self.bias_stacked) + shard_size = self.bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size bias = bias[start_idx:end_idx] return bias - def apply(self, x: torch.Tensor) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) + self.punica_wrapper.add_shrink_packed_nslice(buffer, x, + self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -329,19 +341,15 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # remains is a standard all_reduce. User should be aware though that # the output is not the same as a normal row_parallel, it should be # reduced before being used - shard_size = self.lora_b_stacked.shape[2] - - # To be compatible with the input of the add_expand_packed_nslice, - # there is only one slice. - buffer = buffer.unsqueeze(dim=0) - self.punica_wrapper.add_expand_packed_nslice( - output, - buffer, - (self.lora_b_stacked, ), - (self.bias_stacked, ) if self.bias_stacked is not None else None, - 1.0, - (shard_size, ), - ) + + # TODO:add DOC + buffer = buffer.squeeze(dim=0) + shard_size = self.lora_b_stacked[0].shape[2] + start_idx = self.tp_rank * shard_size + self.punica_wrapper.add_expand_slice( + output, buffer, self.lora_b_stacked[0], + self.bias_stacked[0] if self.bias_stacked is not None else None, + start_idx, shard_size) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e24a321680fdb..1c63e300b7838 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -250,13 +250,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - - # Embedding layer only need expand op - self.punica_wrapper.add_expand(full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - bias_all=None, - add_input=True) + self.punica_wrapper.add_lora_embedding(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod @@ -277,9 +274,11 @@ def __init__(self, base_layer: LinearBase): self.base_layer = base_layer self.input_size = self.base_layer.input_size self.device = _get_lora_device(self.base_layer) - self.output_slices: Tuple[int, ...] self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + self.output_slices: Tuple[int, ...] 
+ self.tp_size: int + def create_lora_weights( self, max_loras: int, @@ -287,14 +286,31 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) + + if isinstance(self.base_layer, ReplicatedLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, ColumnParallelLinear): + lora_a_out_size = (lora_config.max_lora_rank if + not lora_config.fully_sharded_loras else divide( + lora_config.max_lora_rank, self.tp_size)) + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, RowParallelLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = (self.output_size if + not lora_config.fully_sharded_loras else divide( + self.output_size, self.tp_size)) + else: + raise NotImplementedError + + lora_bias_out_size = self.output_size self.lora_a_stacked = tuple( torch.zeros( max_loras, 1, - lora_a_output_size_per_partition, + lora_a_out_size, self.input_size, dtype=lora_config.lora_dtype, device=self.device, @@ -303,7 +319,7 @@ def create_lora_weights( torch.zeros( max_loras, 1, - self.output_size, + lora_b_out_size, lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, @@ -313,7 +329,7 @@ def create_lora_weights( torch.zeros( max_loras, 1, - self.output_size, + lora_bias_out_size, dtype=lora_config.lora_dtype, device=self.device, ) for _ in range(self.n_slices)) @@ -337,7 +353,6 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) @@ -373,7 +388,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): def __init__(self, base_layer: ReplicatedLinear) -> None: super().__init__(base_layer, ) - self.tp_size = 1 #To ensure interface compatibility, it is set to 1 + # To ensure interface compatibility, set to 1 always. + self.tp_size = 1 + self.output_size = self.base_layer.output_size self.n_slices = 1 @@ -412,8 +429,10 @@ def can_replace_layer( class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. - LoRA B is sliced for tensor parallelism. + There are two types for the `base_layer`: + 1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`. + 2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`. 
""" def __init__(self, base_layer: ColumnParallelLinear) -> None: @@ -425,6 +444,7 @@ def __init__(self, base_layer: ColumnParallelLinear) -> None: base_layer) is MergedColumnParallelLinear self.tp_size = get_tensor_model_parallel_world_size() self.output_size = self.base_layer.output_size_per_partition + # There is only one LoRA layer self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: @@ -513,6 +533,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) + # There are two LoRA layers self.n_slices = 2 def create_lora_weights( @@ -676,6 +697,7 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.base_layer.head_size) self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + # There is only one LoRA layer self.n_slices = 1 def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: @@ -733,6 +755,7 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) + # There are three LoRA layer. self.n_slices = 3 def create_lora_weights( @@ -965,13 +988,16 @@ def __init__(self, base_layer: RowParallelLinear) -> None: # reset input_size self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size + + self.tp_rank = get_tensor_model_parallel_rank() + # There is only one LoRA layer. self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.input_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size lora_a = lora_a[start_idx:end_idx, :] return lora_a @@ -998,10 +1024,9 @@ def forward(self, input_): input_parallel = input_ else: # TODO: simplify code below - tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.base_layer.tp_size) - input_parallel = splitted_input[tp_rank].contiguous() + input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. output_parallel = self.apply(input_parallel) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index f2fed6a485f64..4b06c05a3e828 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -454,7 +454,7 @@ def apply_expand_slice(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], + bias_stacked: Optional[torch.Tensor], y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True): @@ -463,8 +463,8 @@ def apply_expand_slice(self, computation, which is suitable for the GEMM of lora'b. """ - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else @@ -542,16 +542,39 @@ def add_shrink( Otherwise, it is the decode stage, and the shrink_decode function should be called. 
""" + y_org = y + y = y.view(-1, y.shape[-1]) shrink_fun: Callable = (self.shrink_prefill if self.is_prefill else self.shrink_decode) shrink_fun(y, x, w_t_all, scale) + y = y.view_as(y_org) + + def add_shrink_packed_nslice( + self, + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, + ): + """ + Perform the ` y[i]+=x@w_t_all[i]` computation, which is suitable for + the GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. + """ + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self.add_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale) def add_expand( self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], + bias_stacked: Optional[torch.Tensor], add_input: bool = True, ): """ @@ -562,19 +585,41 @@ def add_expand( Otherwise, it is the decode stage, and the expand_decode function should be called. """ - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) - def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_b_stacked: Tuple[torch.Tensor, ...], - bias_stacked: Optional[Tuple[torch.Tensor, - ...]], - scale: float, - output_slices: Tuple[int, ...]) -> None: + def add_expand_slice(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + bias_stacked: Optional[torch.Tensor], + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): + """ + Similar to `add_expand` + """ + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) + + expand_slice_fun: Callable = (self.expand_slice_prefill + if self.is_prefill else + self.expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + + def add_expand_packed_nslice( + self, + y: torch.Tensor, + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], + add_input=True, + ) -> None: """ Similar to `add_expand` """ @@ -591,17 +636,38 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, None, offset_left, output_slices[slice_idx], - add_input=True) + add_input=add_input) offset_left += output_slices[slice_idx] y = y.view_as(y_org) + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool = True, + ): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'b or embedding layer's lora. + When `is_prefill` is true, it indicates that it is currently the + prefill stage, and the `expand_prefill` function should be called. + Otherwise, it is the decode stage, and the expand_decode function + should be called. 
+ """ + + # Embedding layer only need expand op + expand_fun: Callable = (self.expand_prefill + if self.is_prefill else self.expand_decode) + expand_fun(y, x, w_t_all, add_input) + def add_lora(self, y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], + bias_stacked: Optional[torch.Tensor], scale: float, y_offset: Optional[int] = None, y_slice_size: Optional[int] = None, @@ -620,7 +686,7 @@ def add_lora(self, x (torch.Tensor): Input tensor wa_t_all (torch.Tensor): lora_a's weight wb_t_all (torch.Tensor): lora_b's weight - bias_all: (torch.Tensor): lora's bias + bias_stacked: (torch.Tensor): lora's bias scale (float): Scaling factor. y_offset (Optional[int], optional): Offset to apply to the starting column of y. @@ -637,11 +703,15 @@ def add_lora(self, buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: - self.add_expand(y, buffer, wb_t_all, bias_all=None, add_input=True) + self.add_expand(y, + buffer, + wb_t_all, + bias_stacked=None, + add_input=True) else: self.apply_expand_slice(y, buffer, @@ -652,32 +722,44 @@ def add_lora(self, add_input=True) y = y.view_as(y_org) - def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - bias_all: Optional[Tuple[torch.Tensor, - ...]], scale: float, - output_slices: Tuple[int, ...]) -> None: + def add_lora_packed_nslice( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_stacked: Optional[Tuple[torch.Tensor, ...]], + scale: float, + output_slices: Tuple[int, ...], + *, + buffer: Optional[Tuple[torch.Tensor, ...]] = None) -> None: """ Applies lora to each input. Similar to add_lora, This method is used for layers that are composed of multiple sublayers (slices) packed together. 
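        A compact sketch of the packed-slice semantics (illustrative only;
        per-token adapter indices, scaling, and bias handling are omitted, and
        the shapes are invented):

            import torch

            tokens, hidden, rank = 4, 16, 8
            output_slices = (32, 32)              # e.g. two projections packed into one matmul
            lora_a = [torch.randn(rank, hidden) for _ in output_slices]
            lora_b = [torch.randn(n, rank) for n in output_slices]

            x = torch.randn(tokens, hidden)
            y = torch.zeros(tokens, sum(output_slices))   # output of the packed base layer

            offset = 0
            for a, b, n in zip(lora_a, lora_b, output_slices):
                # each slice gets its own shrink + expand, written to its column range of y
                y[:, offset:offset + n] += (x @ a.T) @ b.T
                offset += n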
""" - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - offset_left = 0 - if bias_all is not None: + + assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) + if bias_stacked is not None: + assert len(bias_stacked) == len(output_slices) y = self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_all) - # TODO fuse these kernels - for slice_idx in range(len(output_slices)): - self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], None, scale, offset_left, - output_slices[slice_idx]) - offset_left += output_slices[slice_idx] + output_slices, bias_stacked) - y = y.view_as(y_org) + if buffer is None: + r = lora_b_stacked[0].size(-1) + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = tuple( + torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device) + for _ in range(len(output_slices))) + self.add_shrink_packed_nslice(buffer, x, lora_a_stacked, scale) + self.add_expand_packed_nslice(y, + buffer, + lora_b_stacked, + None, + output_slices, + add_input=True) def add_lora_logits(self, y: torch.Tensor, From dc5cb0bf6a08fddf78ad9812e86ea414f307f1db Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 09:01:53 +0000 Subject: [PATCH 10/22] Optimize fs_layer.py Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 126 ++++++++++-------------------- 1 file changed, 43 insertions(+), 83 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index c7a13f83f48eb..74b6f34155988 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -31,6 +31,46 @@ def dec(*args, **kwargs): return dec +def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): + """ + For `ColumnParallelLinearWithLoRA` or classes that inherit from + `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. + """ + assert ( + layer.n_slices + == len(layer.lora_a_stacked) + == len(layer.lora_b_stacked) + == len(layer.output_slices) + ) + if layer.bias_stacked is not None: + assert layer.n_slices==len(layer.bias_stacked) + + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + + # Since communication is needed, the buffer is directly initialized as a + # tensor rather than a tuple of tensor. + buffers = torch.zeros( + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + + layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, + layer.lora_a_stacked, 1.0) + buffers = tensor_model_parallel_all_gather(buffers) + layer.punica_wrapper.add_expand_packed_nslice(output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + layer.output_slices, + add_input=True) + + output = output.view(*out_orig_shape) + # now have column partitioned and packed output + return output # these layers are based on the tensor parallelism strategy given in # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 
2023, @@ -59,30 +99,8 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - # TODO add doc - buffer = torch.zeros( - (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - self.punica_wrapper.add_shrink_packed_nslice(buffer, x, - self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand_packed_nslice(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - self.output_slices, - add_input=True) - - # now have column partitioned output - output = output.view(*out_orig_shape) - return output + return _mcp_apply(x, bias, self) + @classmethod @_fully_sharded_can_replace @@ -102,43 +120,6 @@ def can_replace_layer( decorate=False, ) - -def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): - """ - MergedColumnParallelLinearWithShardedLoRA and - MergedQKVParallelLinearWithShardedLora share the same - LoRa weight application method. - - The main difference is the step by shard_size for lora_b which can - vary for MergedQKVParallelLinearWithShardedLora but is constant for - MergedColumnParallelLinearWithShardedLoRA. - """ - # expecting 2 for column parallel and 3 for qkv - assert len(layer.lora_a_stacked) == layer.n_slices - output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffers = torch.zeros( - (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, - layer.lora_a_stacked, 1.0) - buffers = tensor_model_parallel_all_gather(buffers) - layer.punica_wrapper.add_expand_packed_nslice(output, - buffers, - layer.lora_b_stacked, - layer.bias_stacked, - layer.output_slices, - add_input=True) - - output = output.view(*out_orig_shape) - # now have column partitioned and packed output - return output - - class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ @@ -204,28 +185,7 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - buffer = torch.zeros( - (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - self.punica_wrapper.add_shrink_packed_nslice(buffer, x, - self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand_packed_nslice(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - self.output_slices, - add_input=True) - # now have column partitioned output - output = output.view(*out_orig_shape) - return output + return _mcp_apply(x, bias, self) @classmethod @_fully_sharded_can_replace From 960bb3bd2c97d22d5f30f7dd1cd35f28c717e674 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 10:06:47 +0000 Subject: [PATCH 11/22] Optimize doc Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 18 ++- vllm/lora/layers.py | 9 +- 
vllm/lora/punica.py | 208 +++++++++++++++--------------- 3 files changed, 118 insertions(+), 117 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 74b6f34155988..038af5d6c3e40 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -31,26 +31,23 @@ def dec(*args, **kwargs): return dec + def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): """ For `ColumnParallelLinearWithLoRA` or classes that inherit from `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. """ - assert ( - layer.n_slices - == len(layer.lora_a_stacked) - == len(layer.lora_b_stacked) - == len(layer.output_slices) - ) + assert (layer.n_slices == len(layer.lora_a_stacked) == len( + layer.lora_b_stacked) == len(layer.output_slices)) if layer.bias_stacked is not None: - assert layer.n_slices==len(layer.bias_stacked) - + assert layer.n_slices == len(layer.bias_stacked) + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - # Since communication is needed, the buffer is directly initialized as a + # Since communication is needed, the buffer is directly initialized as a # tensor rather than a tuple of tensor. buffers = torch.zeros( (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), @@ -72,6 +69,7 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): # now have column partitioned and packed output return output + # these layers are based on the tensor parallelism strategy given in # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, # https://arxiv.org/abs/2311.03285. @@ -100,7 +98,6 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) - @classmethod @_fully_sharded_can_replace @@ -120,6 +117,7 @@ def can_replace_layer( decorate=False, ) + class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 1c63e300b7838..323cc4fbde604 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -376,11 +376,10 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice(output, x, - self.lora_a_stacked, - self.lora_b_stacked, - self.bias_stacked, 1.0, - self.output_slices) + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, + self.lora_b_stacked, + self.bias_stacked, 1.0, + self.output_slices) return output diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 4b06c05a3e828..9465db16c8892 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -527,7 +527,7 @@ def apply_bias_packed_nslice( return output.view_as(org_output) - def add_shrink( + def apply_shrink( self, y: torch.Tensor, x: torch.Tensor, @@ -549,27 +549,7 @@ def add_shrink( shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def add_shrink_packed_nslice( - self, - y: Union[Tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - scale: float, - ): - """ - Perform the ` y[i]+=x@w_t_all[i]` computation, which is suitable for - the GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. 
- Otherwise, it is the decode stage, and the shrink_decode function - should be called. - """ - x = x.view(-1, x.shape[-1]) - # TODO fuse these kernels - for slice_idx in range(len(lora_a_stacked)): - self.add_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale) - - def add_expand( + def apply_expand( self, y: torch.Tensor, x: torch.Tensor, @@ -611,6 +591,37 @@ def add_expand_slice(self, self.expand_slice_decode) expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + def add_shrink_packed_nslice( + self, + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, + ): + """ + Performs GEMM for multiple slices of lora_a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + """ + + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self.apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], + scale) + def add_expand_packed_nslice( self, y: torch.Tensor, @@ -621,8 +632,23 @@ def add_expand_packed_nslice( add_input=True, ) -> None: """ - Similar to `add_expand` - """ + Performs GEMM and bias addition for multiple slices of lora_b. + + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + + bias_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight + output_slices (Tuple[int, ...]): Every slice's size + add_input (bool): Defaults to True. + """ y_org = y y = y.view(-1, y.shape[-1]) offset_left = 0 @@ -649,12 +675,17 @@ def add_lora_embedding( add_input: bool = True, ): """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'b or embedding layer's lora. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function - should be called. + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + + Semantics: + y += x @ w_t_all + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + w_t_all (torch.Tensor): Transposed weight matrix for all LoRAs. + add_input (bool): Default to True. 
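            A tiny sketch of what this amounts to for a single adapter
            (illustrative only; the stacked w_t_all and the per-token adapter
            indices are collapsed to one adapter here):

                import torch

                tokens, rank, embed_dim = 5, 8, 32
                lora_a_embeddings = torch.randn(tokens, rank)    # per-token LoRA-A lookup result (x)
                lora_b = torch.randn(embed_dim, rank)            # w_t_all collapsed to one adapter
                base_out = torch.randn(tokens, embed_dim)        # base embedding output (y)

                base_out += lora_a_embeddings @ lora_b.T         # expand only: y += x @ w_t_all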
+ """ # Embedding layer only need expand op @@ -662,67 +693,7 @@ def add_lora_embedding( if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) - def add_lora(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - scale: float, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None) -> None: - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0)+bias[i] - Args: - y (torch.Tensor): Output tensor. Will be changed in-place. - x (torch.Tensor): Input tensor - wa_t_all (torch.Tensor): lora_a's weight - wb_t_all (torch.Tensor): lora_b's weight - bias_stacked: (torch.Tensor): lora's bias - scale (float): Scaling factor. - y_offset (Optional[int], optional): Offset to apply to the starting - column of y. - y_slice_size (Optional[int], optional): Size of the y column slice. - buffer (Optional[torch.Tensor], optional): Defaults to None. - """ - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) - self.add_shrink(buffer, x, wa_t_all, scale) - if y_offset is None and y_slice_size is None: - self.add_expand(y, - buffer, - wb_t_all, - bias_stacked=None, - add_input=True) - else: - self.apply_expand_slice(y, - buffer, - wb_t_all, - None, - y_offset, - y_slice_size, - add_input=True) - y = y.view_as(y_org) - - def add_lora_packed_nslice( + def add_lora_linear( self, y: torch.Tensor, x: torch.Tensor, @@ -734,9 +705,26 @@ def add_lora_packed_nslice( *, buffer: Optional[Tuple[torch.Tensor, ...]] = None) -> None: """ - Applies lora to each input. Similar to add_lora, This method is - used for layers that are composed of multiple sublayers - (slices) packed together. + Applicable to linear-related lora. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0)+bias_stacked[i] + + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + scale (float): Scaling factor. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. """ assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) @@ -764,25 +752,41 @@ def add_lora_packed_nslice( def add_lora_logits(self, y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, scale, *, buffer: Optional[torch.Tensor] = None) -> None: """ - LogitsProcessorWithLoRA always using bgmv - """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. 
+ x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. + """ y_org = y y = y.view(-1, y.shape[-1]) x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) + r = lora_b_stacked.size(-1) if buffer is None: # We set the buffer to be float32 by default ,refer to: # https://github.com/triton-lang/triton/issues/1387 buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - - bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) - bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) + # LogitsProcessorWithLoRA always using bgmv. + bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, + lora_b_stacked, + y, + self.sampler_indices, + add_inputs=True) y = y.view_as(y_org) From 61a8085941eea5f0598f4ee350e1800158254744 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 10:53:27 +0000 Subject: [PATCH 12/22] Optimize interface Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 6 ++---- vllm/lora/punica.py | 34 ++++++++----------------------- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 038af5d6c3e40..365d1c57c404b 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -302,12 +302,10 @@ def apply(self, # TODO:add DOC buffer = buffer.squeeze(dim=0) - shard_size = self.lora_b_stacked[0].shape[2] - start_idx = self.tp_rank * shard_size - self.punica_wrapper.add_expand_slice( + self.punica_wrapper.add_expand_fs_rowlinear( output, buffer, self.lora_b_stacked[0], self.bias_stacked[0] if self.bias_stacked is not None else None, - start_idx, shard_size) + add_input=True) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 9465db16c8892..8bc6500dd0cbe 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -549,11 +549,11 @@ def apply_shrink( shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def apply_expand( + def add_expand_fs_rowlinear( self, y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, + lora_b_stacked: torch.Tensor, bias_stacked: Optional[torch.Tensor], add_input: bool = True, ): @@ -570,26 +570,9 @@ def apply_expand( expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) - expand_fun(y, x, w_t_all, add_input) - - def add_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): - """ - Similar to `add_expand` - """ - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) + expand_fun(y, x, lora_b_stacked, add_input) + - expand_slice_fun: Callable = (self.expand_slice_prefill - if self.is_prefill else - self.expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) def add_shrink_packed_nslice( self, @@ -664,26 +647,25 @@ def add_expand_packed_nslice( output_slices[slice_idx], add_input=add_input) offset_left += output_slices[slice_idx] - y = y.view_as(y_org) def add_lora_embedding( self, y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, + lora_b_stacked: torch.Tensor, add_input: bool = True, ): """ Applies lora specifically for VocabParallelEmbeddingWithLoRA. 
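
For the logits path just above, the whole operation is a single shrink/expand pair gathered by sampler_indices rather than token_lora_indices. A reference-only sketch under assumed shapes (the real method always goes through the bgmv kernels and omits nothing for padding; the "-1 = no adapter" case is left out here for brevity):

import torch

def lora_logits_reference(y, x, lora_a_stacked, lora_b_stacked,
                          sampler_indices, scale):
    # Assumed shapes: x (num_seqs, hidden), lora_a_stacked (num_loras, r, hidden),
    # lora_b_stacked (num_loras, vocab, r), sampler_indices (num_seqs,)
    a = lora_a_stacked[sampler_indices]                    # per-sequence A
    b = lora_b_stacked[sampler_indices]                    # per-sequence B
    # float32 intermediate, mirroring the wrapper's default buffer dtype
    buf = torch.einsum("th,trh->tr", x.float(), a.float()) * scale
    y += torch.einsum("tr,tvr->tv", buf, b.float()).to(y.dtype)
    return y
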
Semantics: - y += x @ w_t_all + y += x @ lora_b_stacked Args: y (torch.Tensor): Output tensor. x (torch.Tensor): Input tensor. - w_t_all (torch.Tensor): Transposed weight matrix for all LoRAs. + lora_b_stacked (torch.Tensor): lora_b's weights. add_input (bool): Default to True. """ @@ -691,7 +673,7 @@ def add_lora_embedding( # Embedding layer only need expand op expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) - expand_fun(y, x, w_t_all, add_input) + expand_fun(y, x, lora_b_stacked, add_input) def add_lora_linear( self, From 4ab1c33ce4b90e01603a16da6aa74f58d8f2e7be Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 15:48:15 +0000 Subject: [PATCH 13/22] Add unit test Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 27 ++++++++++++++++++++------- vllm/lora/fully_sharded_layers.py | 23 +++++++++++------------ vllm/lora/layers.py | 3 ++- vllm/lora/punica.py | 20 +++++++++----------- 4 files changed, 42 insertions(+), 31 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 15e576cb065c7..c1de857821e23 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -669,8 +669,9 @@ def create_random_linear_replicated_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -679,7 +680,8 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_parallel_layer(): if orientation == "row": @@ -700,7 +702,12 @@ def create_random_linear_parallel_layer(): if not fully_shard else ColumnParallelLinearWithShardedLoRA(linear)) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked)) + if bias_enabled: + assert len(lora_linear.bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.bias_stacked is None return linear, lora_linear for i in range(10): @@ -784,8 +791,9 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -794,7 +802,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_column_parallel_packed_layer(): if repeats == 2: @@ -835,7 +844,12 @@ class FakeConfig: lora_linear.create_lora_weights(max_loras, lora_config, model_config=FakeConfig()) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked)) + if bias_enabled: + assert 
len(lora_linear.bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.bias_stacked is None return linear, lora_linear for i in range(10): @@ -911,7 +925,6 @@ class FakeConfig: 512, lora_config.lora_extra_vocab_size, ) - # lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 365d1c57c404b..0d13914d99136 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -55,15 +55,14 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): device=x.device, ) - layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, - layer.lora_a_stacked, 1.0) + layer.punica_wrapper.add_shrink(buffers, x, layer.lora_a_stacked, 1.0) buffers = tensor_model_parallel_all_gather(buffers) - layer.punica_wrapper.add_expand_packed_nslice(output, - buffers, - layer.lora_b_stacked, - layer.bias_stacked, - layer.output_slices, - add_input=True) + layer.punica_wrapper.add_expand(output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + layer.output_slices, + add_input=True) output = output.view(*out_orig_shape) # now have column partitioned and packed output @@ -289,8 +288,7 @@ def apply(self, device=x.device, ) - self.punica_wrapper.add_shrink_packed_nslice(buffer, x, - self.lora_a_stacked, 1.0) + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -300,10 +298,11 @@ def apply(self, # the output is not the same as a normal row_parallel, it should be # reduced before being used - # TODO:add DOC buffer = buffer.squeeze(dim=0) self.punica_wrapper.add_expand_fs_rowlinear( - output, buffer, self.lora_b_stacked[0], + output, + buffer, + self.lora_b_stacked[0], self.bias_stacked[0] if self.bias_stacked is not None else None, add_input=True) output = output.view(*out_orig_shape) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 323cc4fbde604..c1f03675ff4f0 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -278,6 +278,8 @@ def __init__(self, base_layer: LinearBase): self.output_slices: Tuple[int, ...] self.tp_size: int + self.output_size: int + self.n_slices: int def create_lora_weights( self, @@ -389,7 +391,6 @@ def __init__(self, base_layer: ReplicatedLinear) -> None: super().__init__(base_layer, ) # To ensure interface compatibility, set to 1 always. 
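
For context on the add_shrink/add_expand calls in _mcp_apply above: lora_a is sharded along the rank dimension, so each rank shrinks locally, the low-rank buffer is all-gathered, and each rank then expands only the output columns it owns. A single-adapter, single-slice sketch under those assumptions (the collective is passed in so the example runs on one process):

import torch

def sharded_column_lora_reference(x, lora_a_shard, lora_b_shard,
                                  all_gather, scale=1.0):
    # lora_a_shard: (r // tp_size, in_dim)  -- rank dimension sharded
    # lora_b_shard: (out_dim // tp_size, r) -- output columns sharded
    partial = (x @ lora_a_shard.t()) * scale   # local shrink
    full = all_gather(partial)                 # (tokens, r) across ranks
    return full @ lora_b_shard.t()             # local expand for owned columns

# Single-process stand-in for the collective (tp_size == 1):
# out = sharded_column_lora_reference(x, a_shard, b_shard, all_gather=lambda t: t)
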
self.tp_size = 1 - self.output_size = self.base_layer.output_size self.n_slices = 1 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 8bc6500dd0cbe..fc79e957c6cc6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -572,9 +572,7 @@ def add_expand_fs_rowlinear( if self.is_prefill else self.expand_decode) expand_fun(y, x, lora_b_stacked, add_input) - - - def add_shrink_packed_nslice( + def add_shrink( self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], x: torch.Tensor, @@ -605,7 +603,7 @@ def add_shrink_packed_nslice( self.apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale) - def add_expand_packed_nslice( + def add_expand( self, y: torch.Tensor, x: Union[Tuple[torch.Tensor, ...], torch.Tensor], @@ -723,13 +721,13 @@ def add_lora_linear( torch.zeros( (x.size(0), r), dtype=torch.float32, device=x.device) for _ in range(len(output_slices))) - self.add_shrink_packed_nslice(buffer, x, lora_a_stacked, scale) - self.add_expand_packed_nslice(y, - buffer, - lora_b_stacked, - None, - output_slices, - add_input=True) + self.add_shrink(buffer, x, lora_a_stacked, scale) + self.add_expand(y, + buffer, + lora_b_stacked, + None, + output_slices, + add_input=True) def add_lora_logits(self, y: torch.Tensor, From 3c2192cb966cc4fd3ea7580edab84458736c94d7 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 16:30:44 +0000 Subject: [PATCH 14/22] Done Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 14 ++++- vllm/lora/fully_sharded_layers.py | 17 ++++-- vllm/lora/layers.py | 5 +- vllm/lora/punica.py | 92 ++++++++----------------------- 4 files changed, 47 insertions(+), 81 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index c1de857821e23..4e4988b07564a 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -565,7 +565,9 @@ def _pretest(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) -def test_linear_replicated(dist_init, num_loras, device, stage) -> None: +@pytest.mark.parametrize("bias_enabled", [True, False]) +def test_linear_replicated(dist_init, num_loras, device, stage, + bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -573,7 +575,8 @@ def test_linear_replicated(dist_init, num_loras, device, stage) -> None: max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_replicated_layer(): @@ -585,7 +588,12 @@ def create_random_linear_replicated_layer(): lora_linear = ReplicatedLinearWithLoRA(linear) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked)) + if bias_enabled: + assert len(lora_linear.bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.bias_stacked is None return linear, lora_linear for i in range(10): diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 0d13914d99136..a0e2dd3d8e5cf 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -62,6 +62,7 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): layer.lora_b_stacked, layer.bias_stacked, layer.output_slices, + offset_start=0, add_input=True) output = output.view(*out_orig_shape) @@ -297,14 +298,18 @@ def apply(self, # remains is a standard all_reduce. 
User should be aware though that # the output is not the same as a normal row_parallel, it should be # reduced before being used - - buffer = buffer.squeeze(dim=0) - self.punica_wrapper.add_expand_fs_rowlinear( + # NOTE offset are based on the rank. + shard_size = self.lora_b_stacked[0].shape[2] + offset_start = self.tp_rank * shard_size + self.punica_wrapper.add_expand( output, buffer, - self.lora_b_stacked[0], - self.bias_stacked[0] if self.bias_stacked is not None else None, - add_input=True) + self.lora_b_stacked, + self.bias_stacked, + self.output_slices, + offset_start=offset_start, + add_input=True, + ) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index c1f03675ff4f0..a6c2c5bd012cb 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -288,7 +288,7 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - + # if isinstance(self.base_layer, ReplicatedLinear): lora_a_out_size = lora_config.max_lora_rank lora_b_out_size = self.output_size @@ -307,7 +307,6 @@ def create_lora_weights( else: raise NotImplementedError - lora_bias_out_size = self.output_size self.lora_a_stacked = tuple( torch.zeros( max_loras, @@ -327,6 +326,7 @@ def create_lora_weights( device=self.device, ) for _ in range(self.n_slices)) if lora_config.bias_enabled: + lora_bias_out_size = lora_b_out_size self.bias_stacked = tuple( torch.zeros( max_loras, @@ -342,6 +342,7 @@ def reset_lora(self, index: int): self.lora_a_stacked[s_index][index] = 0 self.lora_b_stacked[s_index][index] = 0 if self.lora_config.bias_enabled: + # Make mypy happy self.bias_stacked = cast(Tuple[torch.Tensor, ...], self.bias_stacked) self.bias_stacked[s_index][index] = 0 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index fc79e957c6cc6..1f503b763d614 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -450,21 +450,18 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) - def apply_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): + def apply_expand(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): """ - Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all+bias` + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` computation, which is suitable for the GEMM of lora'b. 
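
The offset_start change above implements the S-LoRA fusion referenced in the comment: each rank expands with its column shard of lora_b and writes into its own slice of the not-yet-reduced output, so the base layer's later all_reduce doubles as the all_gather of the LoRA output. A sketch of that indexing (the shard layout is an assumption for illustration):

import torch

def rowparallel_lora_expand_reference(output, buffer, lora_b_shard, tp_rank):
    # buffer: (tokens, r) low-rank activations, already all_reduced
    # lora_b_shard: (out_dim // tp_size, r) -- this rank's column shard of lora_b
    shard_size = lora_b_shard.shape[0]
    offset_start = tp_rank * shard_size
    output[:, offset_start:offset_start + shard_size] += buffer @ lora_b_shard.t()
    return output  # per-rank slices are summed by the subsequent all_reduce
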
""" - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else @@ -472,30 +469,6 @@ def apply_expand_slice(self, expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) def apply_bias( - self, - indices: torch.Tensor, - output: torch.Tensor, - bias_stacked: torch.Tensor, - ): - """Applies bias to output - - Input shapes: - bias_stacked: (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) - bias_stacked = bias_stacked[indices] - bias_stacked[indices == -1] = 0 - output += bias_stacked - - return output.view_as(org_output) - - def apply_bias_packed_nslice( self, indices: torch.Tensor, output: torch.Tensor, @@ -549,29 +522,6 @@ def apply_shrink( shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def add_expand_fs_rowlinear( - self, - y: torch.Tensor, - x: torch.Tensor, - lora_b_stacked: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - add_input: bool = True, - ): - """ - Perform the ` y+=x@w_t_all+bias` computation, which is suitable for the - GEMM of lora'b. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function - should be called. - """ - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) - - expand_fun: Callable = (self.expand_prefill - if self.is_prefill else self.expand_decode) - expand_fun(y, x, lora_b_stacked, add_input) - def add_shrink( self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], @@ -610,6 +560,7 @@ def add_expand( lora_b_stacked: Tuple[torch.Tensor, ...], bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], + offset_start: int = 0, add_input=True, ) -> None: """ @@ -632,18 +583,19 @@ def add_expand( """ y_org = y y = y.view(-1, y.shape[-1]) - offset_left = 0 + offset_left = offset_start if bias_stacked is not None: - self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_stacked) + self.apply_bias(self.token_lora_indices, y, output_slices, + bias_stacked) for slice_idx in range(len(lora_b_stacked)): - self.apply_expand_slice(y, - x[slice_idx], - lora_b_stacked[slice_idx], - None, - offset_left, - output_slices[slice_idx], - add_input=add_input) + self.apply_expand( + y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_input=add_input, + ) offset_left += output_slices[slice_idx] y = y.view_as(y_org) @@ -710,8 +662,8 @@ def add_lora_linear( assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) if bias_stacked is not None: assert len(bias_stacked) == len(output_slices) - y = self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_stacked) + y = self.apply_bias(self.token_lora_indices, y, output_slices, + bias_stacked) if buffer is None: r = lora_b_stacked[0].size(-1) From bb60e25733674a0e59d3cbcd31bb98ca81ef8dc2 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 03:51:09 +0000 Subject: [PATCH 15/22] Optimize function name Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 12 ++-- vllm/lora/fully_sharded_layers.py | 13 ++-- vllm/lora/layers.py | 113 +++++++++++++++++------------- vllm/lora/punica.py 
| 29 ++++---- 4 files changed, 94 insertions(+), 73 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 4e4988b07564a..06192e9678ba7 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -591,9 +591,9 @@ def create_random_linear_replicated_layer(): assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked)) if bias_enabled: - assert len(lora_linear.bias_stacked) == lora_linear.n_slices + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: - assert lora_linear.bias_stacked is None + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -713,9 +713,9 @@ def create_random_linear_parallel_layer(): assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked)) if bias_enabled: - assert len(lora_linear.bias_stacked) == lora_linear.n_slices + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: - assert lora_linear.bias_stacked is None + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -855,9 +855,9 @@ class FakeConfig: assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked)) if bias_enabled: - assert len(lora_linear.bias_stacked) == lora_linear.n_slices + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: - assert lora_linear.bias_stacked is None + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index a0e2dd3d8e5cf..545ec21ca74c1 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -39,8 +39,8 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): """ assert (layer.n_slices == len(layer.lora_a_stacked) == len( layer.lora_b_stacked) == len(layer.output_slices)) - if layer.bias_stacked is not None: - assert layer.n_slices == len(layer.bias_stacked) + if layer.lora_bias_stacked is not None: + assert layer.n_slices == len(layer.lora_bias_stacked) output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) @@ -60,7 +60,7 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): layer.punica_wrapper.add_expand(output, buffers, layer.lora_b_stacked, - layer.bias_stacked, + layer.lora_bias_stacked, layer.output_slices, offset_start=0, add_input=True) @@ -268,8 +268,9 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - self.bias_stacked = cast(Tuple[torch.Tensor, ...], self.bias_stacked) - shard_size = self.bias_stacked[0].shape[2] + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + shard_size = self.lora_bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size bias = bias[start_idx:end_idx] @@ -305,7 +306,7 @@ def apply(self, output, buffer, self.lora_b_stacked, - self.bias_stacked, + self.lora_bias_stacked, self.output_slices, offset_start=offset_start, add_input=True, diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a6c2c5bd012cb..3ff96832eb5a5 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -274,7 +274,7 @@ def __init__(self, base_layer: LinearBase): self.base_layer = base_layer self.input_size = self.base_layer.input_size self.device = _get_lora_device(self.base_layer) - 
self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None self.output_slices: Tuple[int, ...] self.tp_size: int @@ -327,7 +327,7 @@ def create_lora_weights( ) for _ in range(self.n_slices)) if lora_config.bias_enabled: lora_bias_out_size = lora_b_out_size - self.bias_stacked = tuple( + self.lora_bias_stacked = tuple( torch.zeros( max_loras, 1, @@ -343,9 +343,9 @@ def reset_lora(self, index: int): self.lora_b_stacked[s_index][index] = 0 if self.lora_config.bias_enabled: # Make mypy happy - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[s_index][index] = 0 + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[s_index][index] = 0 def set_lora( self, @@ -353,14 +353,20 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): + # Except for QKVParallelLinearWithLora and + # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers + # have a tuple size of 1. These two layers will override this function. + assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == + self.n_slices == 1) + self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) self.lora_a_stacked[0][index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -368,12 +374,13 @@ def set_lora( self.lora_b_stacked[0][index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[0][index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) + if lora_bias is not None: + + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( + lora_bias.T, non_blocking=True) def apply(self, x: torch.Tensor, @@ -381,7 +388,7 @@ def apply(self, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, self.lora_b_stacked, - self.bias_stacked, 1.0, + self.lora_bias_stacked, 1.0, self.output_slices) return output @@ -543,6 +550,10 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: + """ + The main reason for overriding this function is to enhance code + maintainability. 
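
As a standalone illustration of the buffer layout the renamed lora_bias_stacked follows, here is a sketch with made-up sizes, assuming lora_a arrives as (in_dim, rank) and lora_b as (rank, out_dim), which matches the transposed copies in the diff; dtype and device handling are omitted.

import torch

max_loras, max_rank, in_dim, out_dim = 8, 16, 1024, 1024   # illustrative sizes
lora_a_stacked = torch.zeros(max_loras, 1, max_rank, in_dim)
lora_b_stacked = torch.zeros(max_loras, 1, out_dim, max_rank)
lora_bias_stacked = torch.zeros(max_loras, 1, out_dim)

def set_lora_slot(index, lora_a, lora_b, lora_bias=None):
    # Copy one adapter (transposed) into slot `index`; adapters with a smaller
    # rank stay zero-padded, so the unused rows and columns contribute nothing.
    lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]] = lora_a.t()
    lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]] = lora_b.t()
    if lora_bias is not None:
        lora_bias_stacked[index, 0, :lora_bias.shape[0]] = lora_bias
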
+ """ self.lora_config = lora_config if not (len(self.base_layer.output_sizes) == self.n_slices @@ -577,7 +588,7 @@ def create_lora_weights( device=self.device, ) for _ in range(self.n_slices)) if lora_config.bias_enabled: - self.bias_stacked = tuple( + self.lora_bias_stacked = tuple( torch.zeros( max_loras, 1, @@ -625,15 +636,15 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) if lora_a[0] is not None: self.lora_a_stacked[0][ @@ -642,12 +653,11 @@ def set_lora( self.lora_b_stacked[0][ index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) - if bias is not None and bias[0] is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[0][index, - 0, :bias[0].shape[0]].copy_(bias[0].T, - non_blocking=True) + if lora_bias is not None and lora_bias[0] is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias[0].shape[0]].copy_( + lora_bias[0].T, non_blocking=True) if lora_a[1] is not None: self.lora_a_stacked[1][ index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( @@ -655,12 +665,11 @@ def set_lora( self.lora_b_stacked[1][ index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) - if bias is not None and bias[1] is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[1][index, - 0, :bias[1].shape[0]].copy_(bias[1].T, - non_blocking=True) + if lora_bias is not None and lora_bias[1] is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[1][index, 0, :lora_bias[1].shape[0]].copy_( + lora_bias[1].T, non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -765,6 +774,10 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: + """ + The main reason for overloading this function is to handle inconsistent + weight dimensions in qkv lora. 
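
Concretely, the qkv path allocates three differently sized buffers because, with grouped-query attention, the packed output is q_size + 2 * kv_size and kv_size generally differs from q_size. A small sketch with made-up per-rank widths:

q_size, kv_size = 4096, 1024          # assumed per-rank projection widths
output_slices = (q_size, kv_size, kv_size)

def split_qkv(out, output_slices):
    # How a packed qkv output row decomposes into its three slices.
    q_size, kv_size, _ = output_slices
    q = out[:, :q_size]
    k = out[:, q_size:q_size + kv_size]
    v = out[:, q_size + kv_size:]
    return q, k, v
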
+ """ self.lora_config = lora_config self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() @@ -832,7 +845,7 @@ def create_lora_weights( ), ) if lora_config.bias_enabled: - self.bias_stacked = ( + self.lora_bias_stacked = ( torch.zeros( max_loras, 1, @@ -915,15 +928,15 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) if lora_b[0] is not None: lora_b_q = lora_b[0] @@ -954,18 +967,24 @@ def set_lora( index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( lora_a[2].T, non_blocking=True) - if bias is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - if bias[0] is not None: - self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( - bias[0].T, non_blocking=True) - if bias[1] is not None: - self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( - bias[1].T, non_blocking=True) - if bias[2] is not None: - self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( - bias[2].T, non_blocking=True) + if lora_bias is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + if lora_bias[0] is not None: + self.lora_bias_stacked[0][index, + 0, :lora_bias[0].shape[0]].copy_( + lora_bias[0].T, + non_blocking=True) + if lora_bias[1] is not None: + self.lora_bias_stacked[1][index, + 0, :lora_bias[1].shape[0]].copy_( + lora_bias[1].T, + non_blocking=True) + if lora_bias[2] is not None: + self.lora_bias_stacked[2][index, + 0, :lora_bias[2].shape[0]].copy_( + lora_bias[2].T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 1f503b763d614..257890a1853b9 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -473,12 +473,12 @@ def apply_bias( indices: torch.Tensor, output: torch.Tensor, output_slices: Tuple[int, ...], - bias_stacked: Tuple[Optional[torch.Tensor], ...], + lora_bias_stacked: Tuple[Optional[torch.Tensor], ...], ): """Applies bias to output Input shapes: - bias_stacked: 3 element tuple of (num_loras, output_dim) + lora_bias_stacked: 3 element tuple of (num_loras, output_dim) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) output_slices: n-1 element tuple of (slice_size...), @@ -490,7 +490,7 @@ def apply_bias( offset_left = 0 for slice_idx, slice in enumerate(output_slices): - bias = bias_stacked[slice_idx] + bias = lora_bias_stacked[slice_idx] if bias is not None: bias = bias.view(-1, bias.shape[-1]) bias = bias[indices] @@ -558,7 +558,7 @@ def add_expand( y: torch.Tensor, x: Union[Tuple[torch.Tensor, ...], torch.Tensor], lora_b_stacked: Tuple[torch.Tensor, ...], - bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], offset_start: int = 0, add_input=True, @@ -570,23 +570,24 @@ def add_expand( for i in range(len(lora_b_stacked)): slice = output_slices[i] y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + - bias_stacked[i] + lora_bias_stacked[i] offset += slice Args: y (torch.Tensor): Output tensor. 
x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight - bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + bias's weight output_slices (Tuple[int, ...]): Every slice's size add_input (bool): Defaults to True. """ y_org = y y = y.view(-1, y.shape[-1]) offset_left = offset_start - if bias_stacked is not None: + if lora_bias_stacked is not None: self.apply_bias(self.token_lora_indices, y, output_slices, - bias_stacked) + lora_bias_stacked) for slice_idx in range(len(lora_b_stacked)): self.apply_expand( y, @@ -631,7 +632,7 @@ def add_lora_linear( x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], lora_b_stacked: Tuple[torch.Tensor, ...], - bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], scale: float, output_slices: Tuple[int, ...], *, @@ -646,24 +647,24 @@ def add_lora_linear( @ lora_a_stacked[indices[i], layer_idx, :, :] @ lora_b_stacked[indices[i], layer_idx, :, :] * scale - ).squeeze(0)+bias_stacked[i] + ).squeeze(0)+lora_bias_stacked[i] Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. - bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. output_slices (Tuple[int, ...]): Every slice's size. buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. """ assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) - if bias_stacked is not None: - assert len(bias_stacked) == len(output_slices) + if lora_bias_stacked is not None: + assert len(lora_bias_stacked) == len(output_slices) y = self.apply_bias(self.token_lora_indices, y, output_slices, - bias_stacked) + lora_bias_stacked) if buffer is None: r = lora_b_stacked[0].size(-1) From b61da95a154e52cdf98b364b9bdbf34c3a76790c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 05:24:57 +0000 Subject: [PATCH 16/22] Optimize doc Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3ff96832eb5a5..ed751bf2635d3 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -357,7 +357,8 @@ def set_lora( ): # Except for QKVParallelLinearWithLora and # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers - # have a tuple size of 1. These two layers will override this function. + # store weights in a tuple of size 1. These two layers will + # override this function. 
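
A reference for the per-token bias path that add_lora_linear now guards with the lora_bias_stacked assert. Shapes are assumptions and the real apply_bias works on flattened views; this only mirrors the gather-by-index, mask -1, add-per-slice semantics.

import torch

def apply_lora_bias_reference(output, lora_bias_stacked, output_slices,
                              token_lora_indices):
    # lora_bias_stacked[i]: (num_loras, out_slice_i); indices: (num_tokens,)
    offset = 0
    for bias, size in zip(lora_bias_stacked, output_slices):
        if bias is not None:
            per_token = bias[token_lora_indices.clamp(min=0)].clone()
            per_token[token_lora_indices == -1] = 0   # no adapter, no bias
            output[:, offset:offset + size] += per_token
        offset += size
    return output
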
assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1) From 0a6b01c34abafdc139e2655f751f1a827cc4daa8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:30:10 +0000 Subject: [PATCH 17/22] format code Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ed751bf2635d3..a466e3a732aa7 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -18,12 +18,9 @@ tensor_model_parallel_gather) from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearBase, - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, LinearBase, MergedColumnParallelLinear, + QKVParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) From e440859f40551bcf7fee1413e52684370efefd23 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:38:10 +0000 Subject: [PATCH 18/22] format code Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a466e3a732aa7..513af27973df9 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -18,9 +18,14 @@ tensor_model_parallel_gather) from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, LinearBase, MergedColumnParallelLinear, - QKVParallelLinear, ReplicatedLinear, RowParallelLinear) +# yapf: enable +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearBase, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) From 7b01f48bab447cdff402944f731e7a3b1aa84df6 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:41:38 +0000 Subject: [PATCH 19/22] fix typo Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 513af27973df9..c32d69162dd75 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -18,7 +18,7 @@ tensor_model_parallel_gather) from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper -# yapf: enable +# yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, MergedColumnParallelLinear, From da2256df0f3802b5878f41998a24c7954505bc72 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:48:52 +0000 Subject: [PATCH 20/22] fix typo Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index c32d69162dd75..ec179eca5f91e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -25,7 +25,7 @@ QKVParallelLinear, ReplicatedLinear, RowParallelLinear) -# yapf: enable +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from 
vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) From a265f7a8fa17aeb8e90a6318714c33f602c7f7ec Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 08:00:46 +0000 Subject: [PATCH 21/22] Modify function name Signed-off-by: Jee Jee Li --- vllm/lora/punica.py | 64 ++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 257890a1853b9..563d1181d6fcb 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -362,7 +362,7 @@ def long_lora_indices(self) -> torch.Tensor: long_lora_len = self.indices_len[4] return self._long_lora_indices[:long_lora_len] - def shrink_prefill( + def _shrink_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -380,7 +380,7 @@ def shrink_prefill( scale, ) - def shrink_decode( + def _shrink_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -389,7 +389,7 @@ def shrink_decode( ): bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) - def expand_prefill( + def _expand_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -407,7 +407,7 @@ def expand_prefill( add_input, ) - def expand_decode( + def _expand_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -416,7 +416,7 @@ def expand_decode( ): bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) - def expand_slice_prefill( + def _expand_slice_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -438,7 +438,7 @@ def expand_slice_prefill( add_input, ) - def expand_slice_decode( + def _expand_slice_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -450,25 +450,25 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) - def apply_expand(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): + def _apply_expand(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): """ Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` computation, which is suitable for the GEMM of lora'b. """ - expand_slice_fun: Callable = (self.expand_slice_prefill + expand_slice_fun: Callable = (self._expand_slice_prefill if self.is_prefill else - self.expand_slice_decode) + self._expand_slice_decode) expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) - def apply_bias( + def _apply_bias( self, indices: torch.Tensor, output: torch.Tensor, @@ -500,7 +500,7 @@ def apply_bias( return output.view_as(org_output) - def apply_shrink( + def _apply_shrink( self, y: torch.Tensor, x: torch.Tensor, @@ -511,14 +511,14 @@ def apply_shrink( Perform the ` y+=x@w_t_all` computation, which is suitable for the GEMM of lora'a. When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the shrink_decode function + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function should be called. 
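
After this rename the split is: add_shrink, add_expand, add_lora_embedding, add_lora_linear and add_lora_logits form the public surface, while the underscore-prefixed helpers only pick between the prefill and decode kernels. A condensed view of the public signatures as they appear in this series (summary only, self and keyword-only markers elided):

# add_shrink(y, x, lora_a_stacked, scale)
# add_expand(y, x, lora_b_stacked, lora_bias_stacked, output_slices,
#            offset_start=0, add_input=True)
# add_lora_embedding(y, x, lora_b_stacked, add_input=True)
# add_lora_linear(y, x, lora_a_stacked, lora_b_stacked, lora_bias_stacked,
#                 scale, output_slices, buffer=None)
# add_lora_logits(y, x, lora_a_stacked, lora_b_stacked, scale, buffer=None)
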
""" y_org = y y = y.view(-1, y.shape[-1]) - shrink_fun: Callable = (self.shrink_prefill - if self.is_prefill else self.shrink_decode) + shrink_fun: Callable = (self._shrink_prefill + if self.is_prefill else self._shrink_decode) shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) @@ -532,8 +532,8 @@ def add_shrink( """ Performs GEMM for multiple slices of lora_a. When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the shrink_decode function + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function should be called. Semantics: @@ -550,8 +550,8 @@ def add_shrink( x = x.view(-1, x.shape[-1]) # TODO fuse these kernels for slice_idx in range(len(lora_a_stacked)): - self.apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], - scale) + self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], + scale) def add_expand( self, @@ -586,10 +586,10 @@ def add_expand( y = y.view(-1, y.shape[-1]) offset_left = offset_start if lora_bias_stacked is not None: - self.apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) + self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) for slice_idx in range(len(lora_b_stacked)): - self.apply_expand( + self._apply_expand( y, x[slice_idx], lora_b_stacked[slice_idx], @@ -622,8 +622,8 @@ def add_lora_embedding( """ # Embedding layer only need expand op - expand_fun: Callable = (self.expand_prefill - if self.is_prefill else self.expand_decode) + expand_fun: Callable = (self._expand_prefill + if self.is_prefill else self._expand_decode) expand_fun(y, x, lora_b_stacked, add_input) def add_lora_linear( @@ -663,8 +663,8 @@ def add_lora_linear( assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) if lora_bias_stacked is not None: assert len(lora_bias_stacked) == len(output_slices) - y = self.apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) + y = self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) if buffer is None: r = lora_b_stacked[0].size(-1) From 2f02dda5aa1b6156f3eeedf01a4246a8225631a6 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 09:03:29 +0000 Subject: [PATCH 22/22] Modify nslices Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 7 ++++--- vllm/lora/layers.py | 15 ++++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 06192e9678ba7..a113e3f7abc1e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -589,7 +589,7 @@ def create_random_linear_replicated_layer(): lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked)) + lora_linear.lora_b_stacked) == 1) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: @@ -711,7 +711,7 @@ def create_random_linear_parallel_layer(): ColumnParallelLinearWithShardedLoRA(linear)) lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked)) + lora_linear.lora_b_stacked) == 1) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: @@ -849,11 +849,12 @@ class FakeConfig: num_key_value_heads = 32 num_attention_heads = 32 + 
n_slices = repeats lora_linear.create_lora_weights(max_loras, lora_config, model_config=FakeConfig()) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked)) + lora_linear.lora_b_stacked) == n_slices) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ec179eca5f91e..473e4bedf3d60 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -545,7 +545,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) # There are two LoRA layers - self.n_slices = 2 + self.n_slices = len(self.base_layer.output_sizes) def create_lora_weights( self, @@ -559,7 +559,7 @@ def create_lora_weights( """ self.lora_config = lora_config - if not (len(self.base_layer.output_sizes) == self.n_slices + if not (len(self.base_layer.output_sizes) == self.n_slices == 2 and self.base_layer.output_sizes[0] == self.base_layer.output_sizes[1]): raise ValueError( @@ -769,7 +769,9 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) # There are three LoRA layer. - self.n_slices = 3 + self.n_slices = len(self.base_layer.output_sizes) + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() def create_lora_weights( self, @@ -782,8 +784,11 @@ def create_lora_weights( weight dimensions in qkv lora. """ self.lora_config = lora_config - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() + + if not (len(self.base_layer.output_sizes) == self.n_slices == 3): + raise ValueError( + "LoRAColumnParallelLinear3Slice requires 3 slices.") + self.q_proj_shard_size = (self.base_layer.num_heads * self.base_layer.head_size) self.kv_proj_shard_size = (self.base_layer.num_kv_heads *