From f6fa2613390f5ec294609c674a1790f91d18fbda Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 30 Nov 2024 01:31:37 +0000 Subject: [PATCH 01/22] Init Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 105 ++++++-------------------------------------- vllm/lora/punica.py | 76 +++++++++++++++++++++++++++----- 2 files changed, 79 insertions(+), 102 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3701988ff692f..85a2402337131 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -67,63 +67,6 @@ def dec(*args, **kwargs): return dec -def apply_bias( - indices: torch.Tensor, - output: torch.Tensor, - bias_stacked: torch.Tensor, -): - """Applies bias to output - - Input shapes: - bias_stacked: (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) - bias_stacked = bias_stacked[indices] - bias_stacked[indices == -1] = 0 - output += bias_stacked - - return output.view_as(org_output) - - -def apply_bias_packed_nslice( - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], - bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], -): - """Applies bias to output - - Input shapes: - bias_stacked: 3 element tuple of (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - offset_left = 0 - for slice_idx, slice in enumerate(output_slices): - bias = bias_stacked[slice_idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[indices] - bias[indices == -1] = 0 - output[:, offset_left:offset_left + slice] += bias - - offset_left += slice - - return output.view_as(org_output) - - @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -401,13 +344,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): @@ -578,13 +518,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) + self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): @@ -774,15 +711,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - (self.output_dim, self.output_dim), - self.bias_stacked, - ) + self.punica_wrapper.add_lora_packed_nslice( - output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, - (self.output_dim, self.output_dim)) + output, x, self.lora_a_stacked, 
self.lora_b_stacked, + self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) return output @classmethod @@ -1131,15 +1063,10 @@ def apply(self, x: torch.Tensor, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) if self.bias_stacked is not None: self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - self.output_slices, - self.bias_stacked, - ) self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0, + self.lora_b_stacked, + self.bias_stacked, 1.0, self.output_slices) return output @@ -1264,15 +1191,9 @@ def set_lora( def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 082041f390750..dcb69c231773a 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -450,6 +450,63 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) + def add_bias( + self, + indices: torch.Tensor, + output: torch.Tensor, + bias_stacked: torch.Tensor, + ): + """Applies bias to output + + Input shapes: + bias_stacked: (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, output_dim) + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) + bias_stacked = bias_stacked[indices] + bias_stacked[indices == -1] = 0 + output += bias_stacked + + return output.view_as(org_output) + + def add_bias_packed_nslice( + self, + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + ): + """Applies bias to output + + Input shapes: + bias_stacked: 3 element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, q_slice_size + 2*kv_slice_size) + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + + offset_left += slice + + return output.view_as(org_output) + def add_shrink( self, y: torch.Tensor, @@ -499,7 +556,6 @@ def add_expand_slice(self, """ Similar to `add_expand` """ - expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else self.expand_slice_decode) @@ -510,6 +566,7 @@ def add_lora(self, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], scale: float, y_offset: Optional[int] = None, y_slice_size: Optional[int] = None, @@ -544,7 +601,8 @@ def add_lora(self, buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - + if bias_all is not None: + x = self.add_bias(self.token_lora_indices, x, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if 
y_offset is None and y_slice_size is None: self.add_expand(y, buffer, wb_t_all, add_input=True) @@ -558,13 +616,10 @@ def add_lora(self, y = y.view_as(y_org) def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - scale: float, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_all: Tuple[Optional[torch.Tensor], + ...], scale: float, output_slices: Tuple[int, ...]) -> None: """ Applies lora to each input. Similar to add_lora, This method is @@ -577,8 +632,9 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, offset_left = 0 # TODO fuse these kernels for slice_idx in range(len(output_slices)): + bias = bias_all[slice_idx] if bias_all is not None else None self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], scale, offset_left, + lora_b_stacked[slice_idx], bias, scale, offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] From aff0182e0b614f0bc6e1ab479bacd85f505a8a02 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 07:20:25 +0000 Subject: [PATCH 02/22] Done 1/2 Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 11 ----------- vllm/lora/punica.py | 10 ++++++---- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 85a2402337131..5441b6c9336c3 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -342,9 +342,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, self.bias_stacked, 1.0) @@ -516,9 +513,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, self.bias_stacked, 1.0) @@ -709,9 +703,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - self.punica_wrapper.add_lora_packed_nslice( output, x, self.lora_a_stacked, self.lora_b_stacked, self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) @@ -1061,8 +1052,6 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, self.lora_b_stacked, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index dcb69c231773a..6414dd49be719 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -479,7 +479,7 @@ def add_bias_packed_nslice( indices: torch.Tensor, output: torch.Tensor, output_slices: Tuple[int, ...], - bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + bias_stacked: Tuple[Optional[torch.Tensor], ...], ): """Applies bias to output 
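[For readers following the bias refactor in patches 01-02 above: the `add_bias` helper moved into `PunicaWrapper` reduces to gathering one bias row per token via `token_lora_indices` and zeroing the rows of tokens that carry no LoRA (index -1). Below is a minimal dense sketch of that behavior; the function name, toy shapes, and out-of-place formulation are assumptions for illustration, not vLLM's actual kernel path.]

```python
import torch

def gather_lora_bias(indices: torch.Tensor, output: torch.Tensor,
                     bias_stacked: torch.Tensor) -> torch.Tensor:
    # bias_stacked: (num_loras, output_dim) -- one bias row per LoRA adapter
    # indices:      (batch_size,)           -- LoRA id per token, -1 means "no LoRA"
    # output:       (batch_size, output_dim)
    bias = bias_stacked[indices]   # gather the bias row selected for each token
    bias[indices == -1] = 0        # tokens without an adapter receive no bias
    return output + bias

# Toy usage with assumed sizes: 3 adapters, hidden dim 4, batch of 5 tokens.
bias_stacked = torch.randn(3, 4)
indices = torch.tensor([0, 2, -1, 1, 0])
out = gather_lora_bias(indices, torch.zeros(5, 4), bias_stacked)
```

[The helpers the patches above relocate into punica.py do this same gather in place on flattened views of `output`, and per slice in the packed n-slice case.]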
@@ -602,7 +602,7 @@ def add_lora(self, dtype=torch.float32, device=x.device) if bias_all is not None: - x = self.add_bias(self.token_lora_indices, x, bias_all) + y = self.add_bias(self.token_lora_indices, y, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: self.add_expand(y, buffer, wb_t_all, add_input=True) @@ -630,11 +630,13 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, x = x.view(-1, x.shape[-1]) y = y.view(-1, y.shape[-1]) offset_left = 0 + if bias_all is not None: + y = self.add_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_all) # TODO fuse these kernels for slice_idx in range(len(output_slices)): - bias = bias_all[slice_idx] if bias_all is not None else None self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], bias, scale, offset_left, + lora_b_stacked[slice_idx], None, scale, offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] From 20f8018d0b8ccf948269829fd8cd5421faed1084 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 10:10:17 +0000 Subject: [PATCH 03/22] Done Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 41 +++++++++------------------- vllm/lora/layers.py | 3 ++- vllm/lora/punica.py | 45 +++++++++++++++++++++++++++---- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index f5c2eced9d2bb..5f2d32defe030 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -73,6 +73,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, + self.bias_stacked, add_input=True) # now have column partitioned output @@ -131,27 +132,14 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): layer.lora_a_stacked[idx], 1.0) buffers = tensor_model_parallel_all_gather(buffers) - left_offset = 0 - for idx in range(n): - shard_size = layer.lora_b_stacked[idx].shape[2] - - if layer.bias_stacked is not None: - bias = layer.bias_stacked[idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[layer.punica_wrapper.token_lora_indices] - bias[layer.punica_wrapper.token_lora_indices == -1] = 0 - output[:, left_offset:left_offset + shard_size] += bias - - layer.punica_wrapper.add_expand_slice( - output, - buffers[idx], - layer.lora_b_stacked[idx], - left_offset, - shard_size, - add_input=True, - ) - left_offset += shard_size + layer.punica_wrapper.add_expand_packed_nslice( + output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + 1.0, + layer.output_slices, + ) output = output.view(*out_orig_shape) # now have column partitioned and packed output @@ -234,6 +222,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, + self.bias_all, add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) @@ -350,15 +339,9 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - - if self.bias_stacked is not None: - bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) - bias = bias[self.punica_wrapper.token_lora_indices] - bias[self.punica_wrapper.token_lora_indices == -1] = 0 - output += bias - self.punica_wrapper.add_expand_slice(output, buffer, - self.lora_b_stacked, start_idx, + self.lora_b_stacked, + self.bias_stacked, start_idx, shard_size) output 
= output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 5441b6c9336c3..73748b5ce511e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -254,6 +254,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, self.lora_b_stacked, + bias_all=None, add_input=True) return full_output.view_as(full_output_org) @@ -618,8 +619,8 @@ def create_lora_weights( ) for _ in range(n_slices)) else: self.bias_stacked = None - self.output_dim = self.lora_b_stacked[0].shape[2] + self.output_slices = (self.output_dim, self.output_dim) def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 6414dd49be719..8808ba94977cb 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -502,7 +502,6 @@ def add_bias_packed_nslice( bias = bias[indices] bias[indices == -1] = 0 output[:, offset_left:offset_left + slice] += bias - offset_left += slice return output.view_as(org_output) @@ -531,17 +530,20 @@ def add_expand( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], add_input: bool = True, ): """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the + Perform the ` y+=x@w_t_all+bias` computation, which is suitable for the GEMM of lora'b. When `is_prefill` is true, it indicates that it is currently the prefill stage, and the `expand_prefill` function should be called. Otherwise, it is the decode stage, and the expand_decode function should be called. """ - + if bias_all is not None: + y = self.add_bias(self.token_lora_indices, y, bias_all) + expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) @@ -550,17 +552,48 @@ def add_expand_slice(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True): """ Similar to `add_expand` """ + if bias_all is not None: + y = self.add_bias(self.token_lora_indices, y, bias_all) + expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else self.expand_slice_decode) expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_stacked: Optional[Tuple[torch.Tensor, + ...]], + scale: float, + output_slices: Tuple[int, ...]) -> None: + """ + Similar to `add_expand` + """ + y_org = y + y = y.view(-1, y.shape[-1]) + offset_left = 0 + if bias_stacked is not None: + self.add_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_stacked) + for slice_idx in range(len(lora_b_stacked)): + self.add_expand_slice(y, + x[slice_idx], + lora_b_stacked[slice_idx], + None, + offset_left, + output_slices[slice_idx], + add_input=True) + offset_left += output_slices[slice_idx] + + y = y.view_as(y_org) + def add_lora(self, y: torch.Tensor, x: torch.Tensor, @@ -579,12 +612,13 @@ def add_lora(self, @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) * scale - ).squeeze(0) + ).squeeze(0)+bias[i] Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor wa_t_all (torch.Tensor): lora_a's weight wb_t_all (torch.Tensor): lora_b's weight + bias_all: (torch.Tensor): lora's bias scale (float): Scaling factor. 
y_offset (Optional[int], optional): Offset to apply to the starting column of y. @@ -605,11 +639,12 @@ def add_lora(self, y = self.add_bias(self.token_lora_indices, y, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: - self.add_expand(y, buffer, wb_t_all, add_input=True) + self.add_expand(y, buffer, wb_t_all, bias_all=None, add_input=True) else: self.add_expand_slice(y, buffer, wb_t_all, + None, y_offset, y_slice_size, add_input=True) From 0a5aa735d3be3f1253c6636ac4bada8345581b88 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 10:34:17 +0000 Subject: [PATCH 04/22] Add lora bias test --- tests/lora/test_llama_tp.py | 60 +++++++++++++++++-------------------- vllm/lora/punica.py | 2 +- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index aae6310a2a213..0b4bcb6554cbb 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -55,15 +55,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@fork_new_process_for_each_test -def test_llama_lora(sql_lora_files): - - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1) - +def generate_and_test(llm,sql_lora_files): print("lora adapter created") assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT @@ -79,6 +71,18 @@ def test_llama_lora(sql_lora_files): print("removing lora") +@fork_new_process_for_each_test +def test_llama_lora(sql_lora_files): + + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1) + generate_and_test(llm,sql_lora_files) + + + @fork_new_process_for_each_test def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and @@ -118,20 +122,7 @@ def test_llama_lora_tp4(sql_lora_files): max_loras=4, tensor_parallel_size=4, ) - - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT - - print("removing lora") + generate_and_test(llm,sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -146,16 +137,21 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, ) - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + generate_and_test(llm,sql_lora_files) - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): - print("removing lora") + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + fully_sharded_loras=True, + enable_lora_bias=True, + ) + generate_and_test(llm,sql_lora_files) \ No newline at end of file diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 8808ba94977cb..9b05b044a815e 100644 --- 
a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -591,7 +591,7 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, output_slices[slice_idx], add_input=True) offset_left += output_slices[slice_idx] - + y = y.view_as(y_org) def add_lora(self, From 6805805b0c55f636daf397da692456ddde56d05a Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 2 Dec 2024 15:38:18 +0000 Subject: [PATCH 05/22] Init --- tests/lora/test_llama_tp.py | 12 ++--- vllm/lora/fully_sharded_layers.py | 25 +++++---- vllm/lora/punica.py | 88 ++++++++++++++++--------------- 3 files changed, 62 insertions(+), 63 deletions(-) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 0b4bcb6554cbb..d3ca7f878191a 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -55,7 +55,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -def generate_and_test(llm,sql_lora_files): +def generate_and_test(llm, sql_lora_files): print("lora adapter created") assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT @@ -79,8 +79,7 @@ def test_llama_lora(sql_lora_files): max_num_seqs=16, max_loras=4, tensor_parallel_size=1) - generate_and_test(llm,sql_lora_files) - + generate_and_test(llm, sql_lora_files) @fork_new_process_for_each_test @@ -122,7 +121,7 @@ def test_llama_lora_tp4(sql_lora_files): max_loras=4, tensor_parallel_size=4, ) - generate_and_test(llm,sql_lora_files) + generate_and_test(llm, sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -137,8 +136,7 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, ) - generate_and_test(llm,sql_lora_files) - + generate_and_test(llm, sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -154,4 +152,4 @@ def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): fully_sharded_loras=True, enable_lora_bias=True, ) - generate_and_test(llm,sql_lora_files) \ No newline at end of file + generate_and_test(llm, sql_lora_files) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 5f2d32defe030..143319afa94bc 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -76,14 +76,6 @@ def apply(self, x: torch.Tensor, self.bias_stacked, add_input=True) # now have column partitioned output - - if self.bias_stacked is not None: - self.bias_stacked = self.bias_stacked.view( - -1, self.bias_stacked.shape[-1]) - self.bias_stacked = self.bias_stacked[ - self.punica_wrapper.token_lora_indices] - output += self.bias_stacked - output = output.view(*out_orig_shape) return output @@ -338,11 +330,18 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # the output is not the same as a normal row_parallel, it should be # reduced before being used shard_size = self.lora_b_stacked.shape[2] - start_idx = self.tp_rank * shard_size - self.punica_wrapper.add_expand_slice(output, buffer, - self.lora_b_stacked, - self.bias_stacked, start_idx, - shard_size) + + # To be compatible with the input of the add_expand_packed_nslice, + # there is only one slice. 
+ buffer = buffer.unsqueeze(dim=0) + self.punica_wrapper.add_expand_packed_nslice( + output, + buffer, + (self.lora_b_stacked, ), + (self.bias_stacked, ) if self.bias_stacked is not None else None, + 1.0, + (shard_size, ), + ) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 9b05b044a815e..6b071e88540ca 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -450,7 +450,28 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) - def add_bias( + def apply_expand_slice(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): + """ + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all+bias` + computation, which is suitable for the + GEMM of lora'b. + """ + if bias_all is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_all) + + expand_slice_fun: Callable = (self.expand_slice_prefill + if self.is_prefill else + self.expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + + def apply_bias( self, indices: torch.Tensor, output: torch.Tensor, @@ -474,7 +495,7 @@ def add_bias( return output.view_as(org_output) - def add_bias_packed_nslice( + def apply_bias_packed_nslice( self, indices: torch.Tensor, output: torch.Tensor, @@ -542,31 +563,12 @@ def add_expand( should be called. """ if bias_all is not None: - y = self.add_bias(self.token_lora_indices, y, bias_all) - + y = self.apply_bias(self.token_lora_indices, y, bias_all) + expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) - def add_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): - """ - Similar to `add_expand` - """ - if bias_all is not None: - y = self.add_bias(self.token_lora_indices, y, bias_all) - - expand_slice_fun: Callable = (self.expand_slice_prefill - if self.is_prefill else - self.expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) - def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, lora_b_stacked: Tuple[torch.Tensor, ...], bias_stacked: Optional[Tuple[torch.Tensor, @@ -580,18 +582,18 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, y = y.view(-1, y.shape[-1]) offset_left = 0 if bias_stacked is not None: - self.add_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_stacked) + self.apply_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_stacked) for slice_idx in range(len(lora_b_stacked)): - self.add_expand_slice(y, - x[slice_idx], - lora_b_stacked[slice_idx], - None, - offset_left, - output_slices[slice_idx], - add_input=True) + self.apply_expand_slice(y, + x[slice_idx], + lora_b_stacked[slice_idx], + None, + offset_left, + output_slices[slice_idx], + add_input=True) offset_left += output_slices[slice_idx] - + y = y.view_as(y_org) def add_lora(self, @@ -636,18 +638,18 @@ def add_lora(self, dtype=torch.float32, device=x.device) if bias_all is not None: - y = self.add_bias(self.token_lora_indices, y, bias_all) + y = self.apply_bias(self.token_lora_indices, y, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: self.add_expand(y, 
buffer, wb_t_all, bias_all=None, add_input=True) else: - self.add_expand_slice(y, - buffer, - wb_t_all, - None, - y_offset, - y_slice_size, - add_input=True) + self.apply_expand_slice(y, + buffer, + wb_t_all, + None, + y_offset, + y_slice_size, + add_input=True) y = y.view_as(y_org) def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, @@ -666,8 +668,8 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, y = y.view(-1, y.shape[-1]) offset_left = 0 if bias_all is not None: - y = self.add_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_all) + y = self.apply_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_all) # TODO fuse these kernels for slice_idx in range(len(output_slices)): self.add_lora(y, x, lora_a_stacked[slice_idx], From c5c4598e3fec1c717bad3660e62a3afe9b9970bd Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Dec 2024 15:20:36 +0000 Subject: [PATCH 06/22] Modify layers backup Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 864 ++++++++++++++++++++++++++------------------ vllm/lora/models.py | 8 +- vllm/lora/punica.py | 5 +- 3 files changed, 513 insertions(+), 364 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 73748b5ce511e..f46a9470f61b7 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,7 +1,7 @@ # pylint: disable=unused-argument import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -19,6 +19,7 @@ from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearBase, MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, @@ -135,6 +136,7 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.base_layer = base_layer self.embeddings_slice: Optional[Tuple[int, int]] self.embeddings_weights: Optional[torch.Tensor] + self.n_slices = 1 def create_lora_weights( self, @@ -168,34 +170,36 @@ def create_lora_weights( dtype=self.base_layer.weight.dtype, device=self.base_layer.weight.device, ) - self.lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.org_vocab_size + - lora_config.lora_extra_vocab_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - self.base_layer.embedding_dim, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) - self.lora_a_stacked_2d = self.lora_a_stacked.view( - self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], - self.lora_a_stacked.shape[2], + self.lora_a_stacked = tuple( + torch.zeros( + ( + max_loras, + self.base_layer.org_vocab_size + + lora_config.lora_extra_vocab_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + ( + max_loras, + 1, + self.base_layer.embedding_dim, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) for _ in range(self.n_slices)) + self.lora_a_stacked_2d = self.lora_a_stacked[0].view( + self.lora_a_stacked[0].shape[0] * self.lora_a_stacked[0].shape[1], + self.lora_a_stacked[0].shape[2], ) def reset_lora(self, index: int): - 
self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 + self.lora_a_stacked[0][index] = 0 + self.lora_b_stacked[0][index] = 0 self.embeddings_tensors[index] = 0 def set_lora( @@ -207,11 +211,12 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( - lora_a, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + self.lora_a_stacked[0][ + index, :lora_a.shape[0], :lora_a.shape[1]].copy_(lora_a, + non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -253,7 +258,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, - self.lora_b_stacked, + self.lora_b_stacked[0], bias_all=None, add_input=True) return full_output.view_as(full_output_org) @@ -269,14 +274,15 @@ def can_replace_layer( return type(source_layer) is VocabParallelEmbedding -class ReplicatedLinearWithLoRA(BaseLayerWithLoRA): +class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): - def __init__(self, base_layer: ReplicatedLinear) -> None: + def __init__(self, base_layer: LinearBase): super().__init__() self.base_layer = base_layer self.input_size = self.base_layer.input_size - self.output_size = self.base_layer.output_size self.device = _get_lora_device(self.base_layer) + self.output_slices: Tuple[int, ...] + self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None def create_lora_weights( self, @@ -285,39 +291,46 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - lora_a_output_size = lora_config.max_lora_rank - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( + lora_a_output_size_per_partition = ( + lora_config.max_lora_rank if not lora_config.fully_sharded_loras + else divide(lora_config.max_lora_rank, self.tp_size)) + self.lora_a_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_a_output_size_per_partition, + self.input_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( max_loras, 1, self.output_size, + lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, - ) - else: - self.bias_stacked = None + ) for _ in range(self.n_slices)) + if lora_config.bias_enabled: + self.bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + for s_index in range(self.n_slices): + self.lora_a_stacked[s_index][index] = 0 + self.lora_b_stacked[s_index][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked = 
cast(Tuple[torch.Tensor, ...], + self.bias_stacked) + self.bias_stacked[s_index][index] = 0 def set_lora( self, @@ -329,25 +342,126 @@ def set_lora( ): self.reset_lora(index) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + if self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) + + self.lora_a_stacked[0][index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) + self.bias_stacked[0][index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) + self.punica_wrapper.add_lora_packed_nslice(output, x, + self.lora_a_stacked, + self.lora_b_stacked, + self.bias_stacked, 1.0, + self.output_slices) return output + +class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): + + def __init__(self, base_layer: ReplicatedLinear) -> None: + super().__init__(base_layer, ) + self.tp_size = 1 #To ensure interface compatibility, it is set to 1 + self.output_size = self.base_layer.output_size + self.n_slices = 1 + + # def create_lora_weights( + # self, + # max_loras: int, + # lora_config: LoRAConfig, + # model_config: Optional[PretrainedConfig] = None, + # ) -> None: + # self.lora_config = lora_config + # lora_a_output_size_per_partition = ( + # lora_config.max_lora_rank if not lora_config.fully_sharded_loras + # else divide(lora_config.max_lora_rank, self.tp_size)) + # self.lora_a_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # lora_a_output_size_per_partition, + # self.input_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.lora_b_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # lora_config.max_lora_rank, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # if lora_config.bias_enabled: + # self.bias_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.output_slices = (self.lora_b_stacked[0].shape[2], ) + + # def reset_lora(self, index: int): + # for s_index in range(self.n_slices): + # self.lora_a_stacked[s_index][index] = 0 + # self.lora_b_stacked[s_index][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[s_index][index] = 0 + + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], 
:lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) + + # def apply(self, x: torch.Tensor, + # bias: Optional[torch.Tensor]) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output + def forward(self, input_): """Forward of ReplicatedLinearWithLoRA @@ -380,7 +494,7 @@ def can_replace_layer( return type(source_layer) is ReplicatedLinear -class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): +class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. @@ -388,65 +502,68 @@ class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): """ def __init__(self, base_layer: ColumnParallelLinear) -> None: - super().__init__() + super().__init__(base_layer) # The base_layer type is ColumnParallelLinear or # MergedColumnParallelLinear, their weight sharding logic is # inconsistent when TP is greater than 1. self.is_merged_col_linear = type( base_layer) is MergedColumnParallelLinear - - self.base_layer = base_layer self.tp_size = get_tensor_model_parallel_world_size() - self.input_size = self.base_layer.input_size self.output_size = self.base_layer.output_size_per_partition - self.device = _get_lora_device(self.base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config - self.tp_size = get_tensor_model_parallel_world_size() - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - - self.output_dim = self.lora_b_stacked.shape[2] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + self.n_slices = 1 + # self.output_slices: Tuple[int, ...] 
+ # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + # self.n_slices = 1 + + # def create_lora_weights( + # self, + # max_loras: int, + # lora_config: LoRAConfig, + # model_config: Optional[PretrainedConfig] = None, + # ) -> None: + # self.lora_config = lora_config + # lora_a_output_size_per_partition = ( + # lora_config.max_lora_rank if not lora_config.fully_sharded_loras + # else divide(lora_config.max_lora_rank, self.tp_size)) + # self.lora_a_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # lora_a_output_size_per_partition, + # self.input_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.lora_b_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # lora_config.max_lora_rank, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + + # if lora_config.bias_enabled: + # self.bias_stacked = tuple( + # torch.zeros( + # max_loras, + # 1, + # self.output_size, + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # self.output_dim = self.lora_b_stacked[0].shape[2] + # self.output_slices = (self.output_dim, ) + + # def reset_lora(self, index: int): + # for s_index in range(self.n_slices): + # self.lora_a_stacked[s_index][index] = 0 + # self.lora_b_stacked[s_index][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[s_index][index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -485,39 +602,44 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = bias[start_idx:end_idx] return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) - return output + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + + # if self.tp_size > 1: + # lora_a = self.slice_lora_a(lora_a) + # lora_b = self.slice_lora_b(lora_b) + # if bias is not None: + # bias = self.slice_bias(bias) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) + + # def apply(self, x: torch.Tensor, + # bias: 
Optional[torch.Tensor]) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output def forward(self, input_): """Forward of ColumnParallelLinear @@ -568,6 +690,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) + self.n_slices = 2 def create_lora_weights( self, @@ -576,8 +699,8 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - n_slices = 2 - if not (len(self.base_layer.output_sizes) == n_slices + + if not (len(self.base_layer.output_sizes) == self.n_slices and self.base_layer.output_sizes[0] == self.base_layer.output_sizes[1]): raise ValueError( @@ -598,7 +721,7 @@ def create_lora_weights( self.input_size, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for _ in range(self.n_slices)) self.lora_b_stacked = tuple( torch.zeros( max_loras, @@ -607,7 +730,7 @@ def create_lora_weights( lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for _ in range(self.n_slices)) if lora_config.bias_enabled: self.bias_stacked = tuple( torch.zeros( @@ -616,20 +739,18 @@ def create_lora_weights( self.output_size // 2, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) - else: - self.bias_stacked = None + ) for _ in range(self.n_slices)) self.output_dim = self.lora_b_stacked[0].shape[2] self.output_slices = (self.output_dim, self.output_dim) - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_b_stacked[1][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 + # def reset_lora(self, index: int): + # self.lora_a_stacked[0][index] = 0 + # self.lora_a_stacked[1][index] = 0 + # self.lora_b_stacked[0][index] = 0 + # self.lora_b_stacked[1][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked[0][index] = 0 + # self.bias_stacked[1][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -686,6 +807,8 @@ def set_lora( index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) if bias is not None and bias[0] is not None: + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_(bias[0].T, non_blocking=True) @@ -697,17 +820,19 @@ def set_lora( index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) if bias is not None and bias[1] is not None: + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_(bias[1].T, non_blocking=True) - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice( - output, x, self.lora_a_stacked, self.lora_b_stacked, - self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) - return output + # def apply(self, x: torch.Tensor, + # bias: Optional[torch.Tensor]) -> torch.Tensor: + # output = 
self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice( + # output, x, self.lora_a_stacked, self.lora_b_stacked, + # self.bias_stacked, 1.0, self.output_slices) + # return output @classmethod @_not_fully_sharded_can_replace @@ -746,6 +871,7 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.base_layer.head_size) self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + self.n_slices = 1 def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() @@ -780,31 +906,33 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + # if self.tp_size > 1: + # lora_a = self.slice_lora_a(lora_a) + # lora_b = self.slice_lora_b(lora_b) + # if bias is not None: + # bias = self.slice_bias(bias) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -828,6 +956,7 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) + self.n_slices = 3 def create_lora_weights( self, @@ -925,9 +1054,6 @@ def create_lora_weights( device=self.device, ), ) - else: - self.bias_stacked = None - self.output_slices = ( self.q_proj_shard_size, self.kv_proj_shard_size, @@ -939,17 +1065,17 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[1][index] = 0 - self.lora_a_stacked[2][index] = 0 - self.lora_b_stacked[2][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 - self.bias_stacked[2][index] = 0 + # def reset_lora(self, index: int): + # self.lora_a_stacked[0][index] = 0 + # self.lora_b_stacked[0][index] = 0 + # self.lora_a_stacked[1][index] = 0 + # self.lora_b_stacked[1][index] = 0 + # self.lora_a_stacked[2][index] = 0 + # self.lora_b_stacked[2][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked[0][index] = 0 + # self.bias_stacked[1][index] = 0 + # 
self.bias_stacked[2][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -1040,6 +1166,8 @@ def set_lora( lora_a[2].T, non_blocking=True) if bias is not None: + self.bias_stacked = cast(Tuple[torch.Tensor, ...], + self.bias_stacked) if bias[0] is not None: self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( bias[0].T, non_blocking=True) @@ -1050,15 +1178,15 @@ def set_lora( self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( bias[2].T, non_blocking=True) - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice(output, x, - self.lora_a_stacked, - self.lora_b_stacked, - self.bias_stacked, 1.0, - self.output_slices) - return output + # def apply(self, x: torch.Tensor, + # bias: Optional[torch.Tensor]) -> torch.Tensor: + # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output @classmethod @_not_fully_sharded_can_replace @@ -1073,70 +1201,83 @@ def can_replace_layer( and len(packed_modules_list) == 3) -class RowParallelLinearWithLoRA(BaseLayerWithLoRA): +class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): + # def __init__(self,base_layer,n_slices): + # self.base_layer = base_layer + # self.input_size = self.base_layer.input_size + # self.device = _get_lora_device(self.base_layer) + # self.output_slices: Tuple[int, ...] + # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + # self.n_slices = n_slices def __init__(self, base_layer: RowParallelLinear) -> None: - super().__init__() - self.base_layer = base_layer + super().__init__(base_layer) + + self.tp_size = get_tensor_model_parallel_world_size() self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size - self.device = _get_lora_device(self.base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config - self.tp_rank = get_tensor_model_parallel_rank() - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.input_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - tp_size = get_tensor_model_parallel_world_size() - lora_b_output_size_per_partition = ( - self.output_size if not lora_config.fully_sharded_loras else - divide(self.output_size, tp_size)) - - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - lora_b_output_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( - ( - max_loras, - 1, - self.output_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - # Lazily initialized - self.indices: torch.Tensor - self.indices_len: List[int] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + self.n_slices = 1 + + # def create_lora_weights( + # self, + # max_loras: int, + # lora_config: LoRAConfig, + # model_config: Optional[PretrainedConfig] = None, + # ) -> None: + # self.lora_config = lora_config + # self.tp_rank = 
get_tensor_model_parallel_rank() + # self.lora_a_stacked = tuple( + # torch.zeros( + # ( + # max_loras, + # 1, + # lora_config.max_lora_rank, + # self.input_size, + # ), + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # tp_size = get_tensor_model_parallel_world_size() + # lora_b_output_size_per_partition = ( + # self.output_size if not lora_config.fully_sharded_loras else + # divide(self.output_size, tp_size)) + + # self.lora_b_stacked = tuple( + # torch.zeros( + # ( + # max_loras, + # 1, + # lora_b_output_size_per_partition, + # lora_config.max_lora_rank, + # ), + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + + # if lora_config.bias_enabled: + # self.bias_stacked = tuple( + # torch.zeros( + # ( + # max_loras, + # 1, + # self.output_size, + # ), + # dtype=lora_config.lora_dtype, + # device=self.device, + # ) for _ in range(self.n_slices)) + # # Lazily initialized + # self.output_slices = (self.lora_b_stacked[0].shape[2], ) + # self.indices: torch.Tensor + # self.indices_len: List[int] + + # def reset_lora(self, index: int): + # for s_index in range(self.n_slices): + # self.lora_a_stacked[s_index][index] = 0 + # self.lora_b_stacked[s_index][index] = 0 + # if self.lora_config.bias_enabled: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[s_index][index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() @@ -1152,39 +1293,43 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.base_layer.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) - return output + # def set_lora( + # self, + # index: int, + # lora_a: torch.Tensor, + # lora_b: torch.Tensor, + # embeddings_tensor: Optional[torch.Tensor], + # bias: Optional[torch.Tensor] = None, + # ): + # self.reset_lora(index) + + # if self.base_layer.tp_size > 1: + # lora_a = self.slice_lora_a(lora_a) + # lora_b = self.slice_lora_b(lora_b) + # if bias is not None: + # bias = self.slice_bias(bias) + + # self.lora_a_stacked[0][index, + # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + # lora_a.T, non_blocking=True) + # self.lora_b_stacked[0][index, + # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + # lora_b.T, non_blocking=True) + # if bias is not None: + # self.bias_stacked = cast(Tuple[torch.Tensor, ...], + # self.bias_stacked) + # self.bias_stacked[0][index, + # 0, :bias.shape[0]].copy_(bias.T, + # non_blocking=True) + + # def apply(self, x: torch.Tensor) -> torch.Tensor: + # output = 
self.base_layer.quant_method.apply(self.base_layer, x) + # self.punica_wrapper.add_lora_packed_nslice(output, x, + # self.lora_a_stacked, + # self.lora_b_stacked, + # self.bias_stacked, 1.0, + # self.output_slices) + # return output def forward(self, input_): """Forward of RowParallelLinear @@ -1267,6 +1412,7 @@ def __init__(self, base_layer: LogitsProcessor, hidden_size: int, self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() self.sharded_to_full_mapping = sharded_to_full_mapping + self.n_slices = 1 @property def logits_as_input(self): @@ -1310,29 +1456,32 @@ def create_lora_weights( if 32000 < self.base_layer.vocab_size > 257024: raise ValueError("When using LoRA, vocab size must be " "32000 >= vocab_size <= 257024") - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - # Pad for kernel compatibility - math.ceil(self.base_layer.vocab_size / - lora_config.lora_vocab_padding_size) * - lora_config.lora_vocab_padding_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) + + self.lora_a_stacked = tuple( + torch.zeros( + ( + max_loras, + 1, + lora_config.max_lora_rank, + self.hidden_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + ( + max_loras, + 1, + # Pad for kernel compatibility + math.ceil(self.base_layer.vocab_size / + lora_config.lora_vocab_padding_size) * + lora_config.lora_vocab_padding_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) self.embeddings_tensors = torch.full( (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), fill_value=float("-inf"), @@ -1346,10 +1495,11 @@ def create_lora_weights( dtype=torch.long) else: self.sharded_to_full_mapping_gpu = None + self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 + self.lora_a_stacked[0][index] = 0 + self.lora_b_stacked[0][index] = 0 self.embeddings_tensors[index] = float("-inf") def set_lora( @@ -1361,12 +1511,12 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + self.lora_a_stacked[0][index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -1430,8 +1580,8 @@ def _get_logits( # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, - self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_a_stacked[0], + self.lora_b_stacked[0], 1.0) # Remove paddings in vocab (if any). 
logits = logits[:, :self.base_layer.vocab_size] diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 2ffefe61427e3..9855b57d0c9c9 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -555,17 +555,17 @@ def create_dummy_lora( input_dim, output_dim, rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", embeddings_tensor_dim=embeddings_tensor_dim, bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, - module.lora_a_stacked.shape[-1], - module.lora_b_stacked.shape[-2], + module.lora_a_stacked[0].shape[-1], + module.lora_b_stacked[0].shape[-2], rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", bias_enabled=bias_enabled, ) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 4ebc92a949e84..f2fed6a485f64 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -596,7 +596,6 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, y = y.view_as(y_org) - def add_lora(self, y: torch.Tensor, x: torch.Tensor, @@ -656,8 +655,8 @@ def add_lora(self, def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], lora_b_stacked: Tuple[torch.Tensor, ...], - bias_all: Tuple[Optional[torch.Tensor], - ...], scale: float, + bias_all: Optional[Tuple[torch.Tensor, + ...]], scale: float, output_slices: Tuple[int, ...]) -> None: """ Applies lora to each input. Similar to add_lora, This method is From 0225059af2c176701d22642ff092d3af51255df4 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 01:14:02 +0000 Subject: [PATCH 07/22] Modify layers.py Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 350 +------------------------------------------- 1 file changed, 2 insertions(+), 348 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index f46a9470f61b7..0a25b7e97f8f0 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -381,87 +381,6 @@ def __init__(self, base_layer: ReplicatedLinear) -> None: self.output_size = self.base_layer.output_size self.n_slices = 1 - # def create_lora_weights( - # self, - # max_loras: int, - # lora_config: LoRAConfig, - # model_config: Optional[PretrainedConfig] = None, - # ) -> None: - # self.lora_config = lora_config - # lora_a_output_size_per_partition = ( - # lora_config.max_lora_rank if not lora_config.fully_sharded_loras - # else divide(lora_config.max_lora_rank, self.tp_size)) - # self.lora_a_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # lora_a_output_size_per_partition, - # self.input_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.lora_b_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # lora_config.max_lora_rank, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # if lora_config.bias_enabled: - # self.bias_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.output_slices = (self.lora_b_stacked[0].shape[2], ) - - # def reset_lora(self, index: int): - # for s_index in range(self.n_slices): - # self.lora_a_stacked[s_index][index] = 0 - # self.lora_b_stacked[s_index][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[s_index][index] = 0 - - # def set_lora( - # self, - # index: int, - # 
lora_a: torch.Tensor, - # lora_b: torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - def forward(self, input_): """Forward of ReplicatedLinearWithLoRA @@ -511,59 +430,6 @@ def __init__(self, base_layer: ColumnParallelLinear) -> None: self.tp_size = get_tensor_model_parallel_world_size() self.output_size = self.base_layer.output_size_per_partition self.n_slices = 1 - # self.output_slices: Tuple[int, ...] - # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None - # self.n_slices = 1 - - # def create_lora_weights( - # self, - # max_loras: int, - # lora_config: LoRAConfig, - # model_config: Optional[PretrainedConfig] = None, - # ) -> None: - # self.lora_config = lora_config - # lora_a_output_size_per_partition = ( - # lora_config.max_lora_rank if not lora_config.fully_sharded_loras - # else divide(lora_config.max_lora_rank, self.tp_size)) - # self.lora_a_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # lora_a_output_size_per_partition, - # self.input_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.lora_b_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # lora_config.max_lora_rank, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - - # if lora_config.bias_enabled: - # self.bias_stacked = tuple( - # torch.zeros( - # max_loras, - # 1, - # self.output_size, - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # self.output_dim = self.lora_b_stacked[0].shape[2] - # self.output_slices = (self.output_dim, ) - - # def reset_lora(self, index: int): - # for s_index in range(self.n_slices): - # self.lora_a_stacked[s_index][index] = 0 - # self.lora_b_stacked[s_index][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[s_index][index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -602,45 +468,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = bias[start_idx:end_idx] return bias - # def set_lora( - # self, - # index: int, - # lora_a: torch.Tensor, - # lora_b: torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - - # if self.tp_size > 1: - # lora_a = self.slice_lora_a(lora_a) - # lora_b = self.slice_lora_b(lora_b) - # if bias is not None: - # bias = self.slice_bias(bias) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, 
:lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - def forward(self, input_): """Forward of ColumnParallelLinear @@ -743,15 +570,6 @@ def create_lora_weights( self.output_dim = self.lora_b_stacked[0].shape[2] self.output_slices = (self.output_dim, self.output_dim) - # def reset_lora(self, index: int): - # self.lora_a_stacked[0][index] = 0 - # self.lora_a_stacked[1][index] = 0 - # self.lora_b_stacked[0][index] = 0 - # self.lora_b_stacked[1][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked[0][index] = 0 - # self.bias_stacked[1][index] = 0 - def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: @@ -826,14 +644,6 @@ def set_lora( 0, :bias[1].shape[0]].copy_(bias[1].T, non_blocking=True) - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice( - # output, x, self.lora_a_stacked, self.lora_b_stacked, - # self.bias_stacked, 1.0, self.output_slices) - # return output - @classmethod @_not_fully_sharded_can_replace def can_replace_layer( @@ -862,7 +672,6 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) - self.tp_size = get_tensor_model_parallel_world_size() self.q_proj_total_size = (self.base_layer.total_num_heads * self.base_layer.head_size) self.q_proj_shard_size = (self.base_layer.num_heads * @@ -905,35 +714,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: self.kv_proj_shard_size * (self.kv_shard_id + 1)] bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - - # def set_lora( - # self, - # index: int, - # lora_a: torch.Tensor, - # lora_b: torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - # if self.tp_size > 1: - # lora_a = self.slice_lora_a(lora_a) - # lora_b = self.slice_lora_b(lora_b) - # if bias is not None: - # bias = self.slice_bias(bias) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - + @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, @@ -1065,18 +846,6 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] - # def reset_lora(self, index: int): - # self.lora_a_stacked[0][index] = 0 - # self.lora_b_stacked[0][index] = 0 - # self.lora_a_stacked[1][index] = 0 - # self.lora_b_stacked[1][index] = 0 - # self.lora_a_stacked[2][index] = 0 - # 
self.lora_b_stacked[2][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked[0][index] = 0 - # self.bias_stacked[1][index] = 0 - # self.bias_stacked[2][index] = 0 - def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: @@ -1178,16 +947,6 @@ def set_lora( self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( bias[2].T, non_blocking=True) - # def apply(self, x: torch.Tensor, - # bias: Optional[torch.Tensor]) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - @classmethod @_not_fully_sharded_can_replace def can_replace_layer( @@ -1203,82 +962,15 @@ def can_replace_layer( class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): - # def __init__(self,base_layer,n_slices): - # self.base_layer = base_layer - # self.input_size = self.base_layer.input_size - # self.device = _get_lora_device(self.base_layer) - # self.output_slices: Tuple[int, ...] - # self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None - # self.n_slices = n_slices def __init__(self, base_layer: RowParallelLinear) -> None: super().__init__(base_layer) self.tp_size = get_tensor_model_parallel_world_size() + # reset input_size self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size self.n_slices = 1 - # def create_lora_weights( - # self, - # max_loras: int, - # lora_config: LoRAConfig, - # model_config: Optional[PretrainedConfig] = None, - # ) -> None: - # self.lora_config = lora_config - # self.tp_rank = get_tensor_model_parallel_rank() - # self.lora_a_stacked = tuple( - # torch.zeros( - # ( - # max_loras, - # 1, - # lora_config.max_lora_rank, - # self.input_size, - # ), - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # tp_size = get_tensor_model_parallel_world_size() - # lora_b_output_size_per_partition = ( - # self.output_size if not lora_config.fully_sharded_loras else - # divide(self.output_size, tp_size)) - - # self.lora_b_stacked = tuple( - # torch.zeros( - # ( - # max_loras, - # 1, - # lora_b_output_size_per_partition, - # lora_config.max_lora_rank, - # ), - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - - # if lora_config.bias_enabled: - # self.bias_stacked = tuple( - # torch.zeros( - # ( - # max_loras, - # 1, - # self.output_size, - # ), - # dtype=lora_config.lora_dtype, - # device=self.device, - # ) for _ in range(self.n_slices)) - # # Lazily initialized - # self.output_slices = (self.lora_b_stacked[0].shape[2], ) - # self.indices: torch.Tensor - # self.indices_len: List[int] - - # def reset_lora(self, index: int): - # for s_index in range(self.n_slices): - # self.lora_a_stacked[s_index][index] = 0 - # self.lora_b_stacked[s_index][index] = 0 - # if self.lora_config.bias_enabled: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[s_index][index] = 0 - def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() shard_size = self.input_size @@ -1293,44 +985,6 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - # def set_lora( - # self, - # index: int, - # lora_a: torch.Tensor, - # lora_b: 
torch.Tensor, - # embeddings_tensor: Optional[torch.Tensor], - # bias: Optional[torch.Tensor] = None, - # ): - # self.reset_lora(index) - - # if self.base_layer.tp_size > 1: - # lora_a = self.slice_lora_a(lora_a) - # lora_b = self.slice_lora_b(lora_b) - # if bias is not None: - # bias = self.slice_bias(bias) - - # self.lora_a_stacked[0][index, - # 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - # lora_a.T, non_blocking=True) - # self.lora_b_stacked[0][index, - # 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - # lora_b.T, non_blocking=True) - # if bias is not None: - # self.bias_stacked = cast(Tuple[torch.Tensor, ...], - # self.bias_stacked) - # self.bias_stacked[0][index, - # 0, :bias.shape[0]].copy_(bias.T, - # non_blocking=True) - - # def apply(self, x: torch.Tensor) -> torch.Tensor: - # output = self.base_layer.quant_method.apply(self.base_layer, x) - # self.punica_wrapper.add_lora_packed_nslice(output, x, - # self.lora_a_stacked, - # self.lora_b_stacked, - # self.bias_stacked, 1.0, - # self.output_slices) - # return output - def forward(self, input_): """Forward of RowParallelLinear From 88366e8e4117d0a23ed343255d6ee8e205384e18 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 01:27:01 +0000 Subject: [PATCH 08/22] Revert embedding and logits layer Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 139 +++++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 74 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 0a25b7e97f8f0..e24a321680fdb 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -136,7 +136,6 @@ def __init__(self, base_layer: VocabParallelEmbedding) -> None: self.base_layer = base_layer self.embeddings_slice: Optional[Tuple[int, int]] self.embeddings_weights: Optional[torch.Tensor] - self.n_slices = 1 def create_lora_weights( self, @@ -170,36 +169,34 @@ def create_lora_weights( dtype=self.base_layer.weight.dtype, device=self.base_layer.weight.device, ) - self.lora_a_stacked = tuple( - torch.zeros( - ( - max_loras, - self.base_layer.org_vocab_size + - lora_config.lora_extra_vocab_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - torch.zeros( - ( - max_loras, - 1, - self.base_layer.embedding_dim, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.base_layer.weight.device, - ) for _ in range(self.n_slices)) - self.lora_a_stacked_2d = self.lora_a_stacked[0].view( - self.lora_a_stacked[0].shape[0] * self.lora_a_stacked[0].shape[1], - self.lora_a_stacked[0].shape[2], + self.lora_a_stacked = torch.zeros( + ( + max_loras, + self.base_layer.org_vocab_size + + lora_config.lora_extra_vocab_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + self.base_layer.embedding_dim, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.base_layer.weight.device, + ) + self.lora_a_stacked_2d = self.lora_a_stacked.view( + self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1], + self.lora_a_stacked.shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 self.embeddings_tensors[index] = 0 def set_lora( @@ -211,12 +208,11 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): 
self.reset_lora(index) - self.lora_a_stacked[0][ - index, :lora_a.shape[0], :lora_a.shape[1]].copy_(lora_a, - non_blocking=True) - self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( + lora_a, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -258,7 +254,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, - self.lora_b_stacked[0], + self.lora_b_stacked, bias_all=None, add_input=True) return full_output.view_as(full_output_org) @@ -714,7 +710,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: self.kv_proj_shard_size * (self.kv_shard_id + 1)] bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - + @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, @@ -1066,7 +1062,6 @@ def __init__(self, base_layer: LogitsProcessor, hidden_size: int, self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() self.sharded_to_full_mapping = sharded_to_full_mapping - self.n_slices = 1 @property def logits_as_input(self): @@ -1110,32 +1105,29 @@ def create_lora_weights( if 32000 < self.base_layer.vocab_size > 257024: raise ValueError("When using LoRA, vocab size must be " "32000 >= vocab_size <= 257024") - - self.lora_a_stacked = tuple( - torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) - self.lora_b_stacked = tuple( - torch.zeros( - ( - max_loras, - 1, - # Pad for kernel compatibility - math.ceil(self.base_layer.vocab_size / - lora_config.lora_vocab_padding_size) * - lora_config.lora_vocab_padding_size, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) for _ in range(self.n_slices)) + self.lora_a_stacked = torch.zeros( + ( + max_loras, + 1, + lora_config.max_lora_rank, + self.hidden_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + self.lora_b_stacked = torch.zeros( + ( + max_loras, + 1, + # Pad for kernel compatibility + math.ceil(self.base_layer.vocab_size / + lora_config.lora_vocab_padding_size) * + lora_config.lora_vocab_padding_size, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) self.embeddings_tensors = torch.full( (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), fill_value=float("-inf"), @@ -1149,11 +1141,10 @@ def create_lora_weights( dtype=torch.long) else: self.sharded_to_full_mapping_gpu = None - self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 + self.lora_a_stacked[index] = 0 + self.lora_b_stacked[index] = 0 self.embeddings_tensors[index] = float("-inf") def set_lora( @@ -1165,12 +1156,12 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - self.lora_a_stacked[0][index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[0][index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) + 
self.lora_a_stacked[index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ index, :embeddings_tensor.shape[0], :embeddings_tensor. @@ -1234,8 +1225,8 @@ def _get_logits( # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, - self.lora_a_stacked[0], - self.lora_b_stacked[0], 1.0) + self.lora_a_stacked, + self.lora_b_stacked, 1.0) # Remove paddings in vocab (if any). logits = logits[:, :self.base_layer.vocab_size] From b446a3c0a8dab537ca6d92af1c2b3dbb78c5f573 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 08:35:22 +0000 Subject: [PATCH 09/22] Modify layers.py and fs_layers.py Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 126 +++++++++++++----------- vllm/lora/layers.py | 69 ++++++++----- vllm/lora/punica.py | 156 +++++++++++++++++++++++------- 3 files changed, 233 insertions(+), 118 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 34d4ffbdb1778..c7a13f83f48eb 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,5 +1,5 @@ # pylint: disable=unused-argument -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -51,30 +51,35 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): # gather operation. def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + # TODO add doc buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) + self.punica_wrapper.add_shrink_packed_nslice(buffer, x, + self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - add_input=True) + self.punica_wrapper.add_expand_packed_nslice(output, + buffer, + self.lora_b_stacked, + self.bias_stacked, + self.output_slices, + add_input=True) + # now have column partitioned output output = output.view(*out_orig_shape) return output @@ -109,29 +114,25 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): MergedColumnParallelLinearWithShardedLoRA. 
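    A minimal, single-process sketch of the math this function implements may
    help; it is illustrative only (shapes and names are invented, not taken
    from this patch), and the tensor-parallel all_gather is emulated with a
    plain concatenation:

        import torch

        tokens, hidden, rank, out_dim, tp = 4, 16, 8, 32, 2
        x = torch.randn(tokens, hidden)
        lora_a = torch.randn(rank, hidden)        # full LoRA A
        lora_b = torch.randn(out_dim, rank)       # full LoRA B

        ref = x @ lora_a.T @ lora_b.T             # unsharded LoRA delta

        # Fully sharded: A is split along the rank dim, B along the output dim.
        a_shards = lora_a.chunk(tp, dim=0)        # (rank // tp, hidden) per rank
        b_shards = lora_b.chunk(tp, dim=0)        # (out_dim // tp, rank) per rank

        partial = [x @ a.T for a in a_shards]     # local shrink on each rank
        buffer = torch.cat(partial, dim=1)        # stands in for all_gather
        cols = [buffer @ b.T for b in b_shards]   # local expand on each rank
        assert torch.allclose(torch.cat(cols, dim=1), ref, rtol=1e-4, atol=1e-4)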
""" # expecting 2 for column parallel and 3 for qkv - n = len(layer.lora_a_stacked) + assert len(layer.lora_a_stacked) == layer.n_slices output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape buffers = torch.zeros( - (n, x.shape[0], layer.lora_a_stacked[0].shape[2]), + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) - for idx in range(n): - layer.punica_wrapper.add_shrink(buffers[idx], x, - layer.lora_a_stacked[idx], 1.0) - + layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, + layer.lora_a_stacked, 1.0) buffers = tensor_model_parallel_all_gather(buffers) - layer.punica_wrapper.add_expand_packed_nslice( - output, - buffers, - layer.lora_b_stacked, - layer.bias_stacked, - 1.0, - layer.output_slices, - ) + layer.punica_wrapper.add_expand_packed_nslice(output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + layer.output_slices, + add_input=True) output = output.view(*out_orig_shape) # now have column partitioned and packed output @@ -161,8 +162,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -194,28 +196,33 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora): def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) + buffer = torch.zeros( + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + self.punica_wrapper.add_shrink_packed_nslice(buffer, x, + self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - add_input=True) + self.punica_wrapper.add_expand_packed_nslice(output, + buffer, + self.lora_b_stacked, + self.bias_stacked, + self.output_slices, + add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) return output @@ -259,8 +266,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -293,7 +301,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): """ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - shard_size = self.lora_b_stacked.shape[2] + shard_size = self.lora_b_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = 
lora_b[:, start_idx:end_idx] @@ -302,25 +310,29 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - shard_size = self.bias_stacked.shape[2] + self.bias_stacked = cast(Tuple[torch.Tensor, ...], self.bias_stacked) + shard_size = self.bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size bias = bias[start_idx:end_idx] return bias - def apply(self, x: torch.Tensor) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) + self.punica_wrapper.add_shrink_packed_nslice(buffer, x, + self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -329,19 +341,15 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # remains is a standard all_reduce. User should be aware though that # the output is not the same as a normal row_parallel, it should be # reduced before being used - shard_size = self.lora_b_stacked.shape[2] - - # To be compatible with the input of the add_expand_packed_nslice, - # there is only one slice. - buffer = buffer.unsqueeze(dim=0) - self.punica_wrapper.add_expand_packed_nslice( - output, - buffer, - (self.lora_b_stacked, ), - (self.bias_stacked, ) if self.bias_stacked is not None else None, - 1.0, - (shard_size, ), - ) + + # TODO:add DOC + buffer = buffer.squeeze(dim=0) + shard_size = self.lora_b_stacked[0].shape[2] + start_idx = self.tp_rank * shard_size + self.punica_wrapper.add_expand_slice( + output, buffer, self.lora_b_stacked[0], + self.bias_stacked[0] if self.bias_stacked is not None else None, + start_idx, shard_size) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e24a321680fdb..1c63e300b7838 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -250,13 +250,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - - # Embedding layer only need expand op - self.punica_wrapper.add_expand(full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - bias_all=None, - add_input=True) + self.punica_wrapper.add_lora_embedding(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod @@ -277,9 +274,11 @@ def __init__(self, base_layer: LinearBase): self.base_layer = base_layer self.input_size = self.base_layer.input_size self.device = _get_lora_device(self.base_layer) - self.output_slices: Tuple[int, ...] self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + self.output_slices: Tuple[int, ...] 
+ self.tp_size: int + def create_lora_weights( self, max_loras: int, @@ -287,14 +286,31 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) + + if isinstance(self.base_layer, ReplicatedLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, ColumnParallelLinear): + lora_a_out_size = (lora_config.max_lora_rank if + not lora_config.fully_sharded_loras else divide( + lora_config.max_lora_rank, self.tp_size)) + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, RowParallelLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = (self.output_size if + not lora_config.fully_sharded_loras else divide( + self.output_size, self.tp_size)) + else: + raise NotImplementedError + + lora_bias_out_size = self.output_size self.lora_a_stacked = tuple( torch.zeros( max_loras, 1, - lora_a_output_size_per_partition, + lora_a_out_size, self.input_size, dtype=lora_config.lora_dtype, device=self.device, @@ -303,7 +319,7 @@ def create_lora_weights( torch.zeros( max_loras, 1, - self.output_size, + lora_b_out_size, lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, @@ -313,7 +329,7 @@ def create_lora_weights( torch.zeros( max_loras, 1, - self.output_size, + lora_bias_out_size, dtype=lora_config.lora_dtype, device=self.device, ) for _ in range(self.n_slices)) @@ -337,7 +353,6 @@ def set_lora( bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) - if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) @@ -373,7 +388,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): def __init__(self, base_layer: ReplicatedLinear) -> None: super().__init__(base_layer, ) - self.tp_size = 1 #To ensure interface compatibility, it is set to 1 + # To ensure interface compatibility, set to 1 always. + self.tp_size = 1 + self.output_size = self.base_layer.output_size self.n_slices = 1 @@ -412,8 +429,10 @@ def can_replace_layer( class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. - LoRA B is sliced for tensor parallelism. + There are two types for the `base_layer`: + 1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`. + 2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`. 
""" def __init__(self, base_layer: ColumnParallelLinear) -> None: @@ -425,6 +444,7 @@ def __init__(self, base_layer: ColumnParallelLinear) -> None: base_layer) is MergedColumnParallelLinear self.tp_size = get_tensor_model_parallel_world_size() self.output_size = self.base_layer.output_size_per_partition + # There is only one LoRA layer self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: @@ -513,6 +533,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) + # There are two LoRA layers self.n_slices = 2 def create_lora_weights( @@ -676,6 +697,7 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.base_layer.head_size) self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + # There is only one LoRA layer self.n_slices = 1 def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: @@ -733,6 +755,7 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) + # There are three LoRA layer. self.n_slices = 3 def create_lora_weights( @@ -965,13 +988,16 @@ def __init__(self, base_layer: RowParallelLinear) -> None: # reset input_size self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size + + self.tp_rank = get_tensor_model_parallel_rank() + # There is only one LoRA layer. self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.input_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size lora_a = lora_a[start_idx:end_idx, :] return lora_a @@ -998,10 +1024,9 @@ def forward(self, input_): input_parallel = input_ else: # TODO: simplify code below - tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.base_layer.tp_size) - input_parallel = splitted_input[tp_rank].contiguous() + input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. output_parallel = self.apply(input_parallel) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index f2fed6a485f64..4b06c05a3e828 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -454,7 +454,7 @@ def apply_expand_slice(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], + bias_stacked: Optional[torch.Tensor], y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True): @@ -463,8 +463,8 @@ def apply_expand_slice(self, computation, which is suitable for the GEMM of lora'b. """ - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else @@ -542,16 +542,39 @@ def add_shrink( Otherwise, it is the decode stage, and the shrink_decode function should be called. 
""" + y_org = y + y = y.view(-1, y.shape[-1]) shrink_fun: Callable = (self.shrink_prefill if self.is_prefill else self.shrink_decode) shrink_fun(y, x, w_t_all, scale) + y = y.view_as(y_org) + + def add_shrink_packed_nslice( + self, + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, + ): + """ + Perform the ` y[i]+=x@w_t_all[i]` computation, which is suitable for + the GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. + """ + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self.add_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale) def add_expand( self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], + bias_stacked: Optional[torch.Tensor], add_input: bool = True, ): """ @@ -562,19 +585,41 @@ def add_expand( Otherwise, it is the decode stage, and the expand_decode function should be called. """ - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) - def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_b_stacked: Tuple[torch.Tensor, ...], - bias_stacked: Optional[Tuple[torch.Tensor, - ...]], - scale: float, - output_slices: Tuple[int, ...]) -> None: + def add_expand_slice(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + bias_stacked: Optional[torch.Tensor], + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): + """ + Similar to `add_expand` + """ + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) + + expand_slice_fun: Callable = (self.expand_slice_prefill + if self.is_prefill else + self.expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + + def add_expand_packed_nslice( + self, + y: torch.Tensor, + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], + add_input=True, + ) -> None: """ Similar to `add_expand` """ @@ -591,17 +636,38 @@ def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, None, offset_left, output_slices[slice_idx], - add_input=True) + add_input=add_input) offset_left += output_slices[slice_idx] y = y.view_as(y_org) + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool = True, + ): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'b or embedding layer's lora. + When `is_prefill` is true, it indicates that it is currently the + prefill stage, and the `expand_prefill` function should be called. + Otherwise, it is the decode stage, and the expand_decode function + should be called. 
+ """ + + # Embedding layer only need expand op + expand_fun: Callable = (self.expand_prefill + if self.is_prefill else self.expand_decode) + expand_fun(y, x, w_t_all, add_input) + def add_lora(self, y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], + bias_stacked: Optional[torch.Tensor], scale: float, y_offset: Optional[int] = None, y_slice_size: Optional[int] = None, @@ -620,7 +686,7 @@ def add_lora(self, x (torch.Tensor): Input tensor wa_t_all (torch.Tensor): lora_a's weight wb_t_all (torch.Tensor): lora_b's weight - bias_all: (torch.Tensor): lora's bias + bias_stacked: (torch.Tensor): lora's bias scale (float): Scaling factor. y_offset (Optional[int], optional): Offset to apply to the starting column of y. @@ -637,11 +703,15 @@ def add_lora(self, buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) + if bias_stacked is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_stacked) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: - self.add_expand(y, buffer, wb_t_all, bias_all=None, add_input=True) + self.add_expand(y, + buffer, + wb_t_all, + bias_stacked=None, + add_input=True) else: self.apply_expand_slice(y, buffer, @@ -652,32 +722,44 @@ def add_lora(self, add_input=True) y = y.view_as(y_org) - def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - bias_all: Optional[Tuple[torch.Tensor, - ...]], scale: float, - output_slices: Tuple[int, ...]) -> None: + def add_lora_packed_nslice( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_stacked: Optional[Tuple[torch.Tensor, ...]], + scale: float, + output_slices: Tuple[int, ...], + *, + buffer: Optional[Tuple[torch.Tensor, ...]] = None) -> None: """ Applies lora to each input. Similar to add_lora, This method is used for layers that are composed of multiple sublayers (slices) packed together. 
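        A compact sketch of the packed-slice semantics (illustrative only;
        per-token adapter indices, scaling, and bias handling are omitted, and
        the shapes are invented):

            import torch

            tokens, hidden, rank = 4, 16, 8
            output_slices = (32, 32)              # e.g. two projections packed into one matmul
            lora_a = [torch.randn(rank, hidden) for _ in output_slices]
            lora_b = [torch.randn(n, rank) for n in output_slices]

            x = torch.randn(tokens, hidden)
            y = torch.zeros(tokens, sum(output_slices))   # output of the packed base layer

            offset = 0
            for a, b, n in zip(lora_a, lora_b, output_slices):
                # each slice gets its own shrink + expand, written to its column range of y
                y[:, offset:offset + n] += (x @ a.T) @ b.T
                offset += n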
""" - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - offset_left = 0 - if bias_all is not None: + + assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) + if bias_stacked is not None: + assert len(bias_stacked) == len(output_slices) y = self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_all) - # TODO fuse these kernels - for slice_idx in range(len(output_slices)): - self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], None, scale, offset_left, - output_slices[slice_idx]) - offset_left += output_slices[slice_idx] + output_slices, bias_stacked) - y = y.view_as(y_org) + if buffer is None: + r = lora_b_stacked[0].size(-1) + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = tuple( + torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device) + for _ in range(len(output_slices))) + self.add_shrink_packed_nslice(buffer, x, lora_a_stacked, scale) + self.add_expand_packed_nslice(y, + buffer, + lora_b_stacked, + None, + output_slices, + add_input=True) def add_lora_logits(self, y: torch.Tensor, From dc5cb0bf6a08fddf78ad9812e86ea414f307f1db Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 09:01:53 +0000 Subject: [PATCH 10/22] Optimize fs_layer.py Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 126 ++++++++++-------------------- 1 file changed, 43 insertions(+), 83 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index c7a13f83f48eb..74b6f34155988 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -31,6 +31,46 @@ def dec(*args, **kwargs): return dec +def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): + """ + For `ColumnParallelLinearWithLoRA` or classes that inherit from + `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. + """ + assert ( + layer.n_slices + == len(layer.lora_a_stacked) + == len(layer.lora_b_stacked) + == len(layer.output_slices) + ) + if layer.bias_stacked is not None: + assert layer.n_slices==len(layer.bias_stacked) + + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + + # Since communication is needed, the buffer is directly initialized as a + # tensor rather than a tuple of tensor. + buffers = torch.zeros( + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + + layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, + layer.lora_a_stacked, 1.0) + buffers = tensor_model_parallel_all_gather(buffers) + layer.punica_wrapper.add_expand_packed_nslice(output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + layer.output_slices, + add_input=True) + + output = output.view(*out_orig_shape) + # now have column partitioned and packed output + return output # these layers are based on the tensor parallelism strategy given in # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 
2023, @@ -59,30 +99,8 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - # TODO add doc - buffer = torch.zeros( - (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - self.punica_wrapper.add_shrink_packed_nslice(buffer, x, - self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand_packed_nslice(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - self.output_slices, - add_input=True) - - # now have column partitioned output - output = output.view(*out_orig_shape) - return output + return _mcp_apply(x, bias, self) + @classmethod @_fully_sharded_can_replace @@ -102,43 +120,6 @@ def can_replace_layer( decorate=False, ) - -def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): - """ - MergedColumnParallelLinearWithShardedLoRA and - MergedQKVParallelLinearWithShardedLora share the same - LoRa weight application method. - - The main difference is the step by shard_size for lora_b which can - vary for MergedQKVParallelLinearWithShardedLora but is constant for - MergedColumnParallelLinearWithShardedLoRA. - """ - # expecting 2 for column parallel and 3 for qkv - assert len(layer.lora_a_stacked) == layer.n_slices - output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffers = torch.zeros( - (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, - layer.lora_a_stacked, 1.0) - buffers = tensor_model_parallel_all_gather(buffers) - layer.punica_wrapper.add_expand_packed_nslice(output, - buffers, - layer.lora_b_stacked, - layer.bias_stacked, - layer.output_slices, - add_input=True) - - output = output.view(*out_orig_shape) - # now have column partitioned and packed output - return output - - class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ @@ -204,28 +185,7 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - buffer = torch.zeros( - (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - self.punica_wrapper.add_shrink_packed_nslice(buffer, x, - self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand_packed_nslice(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - self.output_slices, - add_input=True) - # now have column partitioned output - output = output.view(*out_orig_shape) - return output + return _mcp_apply(x, bias, self) @classmethod @_fully_sharded_can_replace From 960bb3bd2c97d22d5f30f7dd1cd35f28c717e674 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 10:06:47 +0000 Subject: [PATCH 11/22] Optimize doc Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 18 ++- vllm/lora/layers.py | 9 +- 
vllm/lora/punica.py | 208 +++++++++++++++--------------- 3 files changed, 118 insertions(+), 117 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 74b6f34155988..038af5d6c3e40 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -31,26 +31,23 @@ def dec(*args, **kwargs): return dec + def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): """ For `ColumnParallelLinearWithLoRA` or classes that inherit from `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. """ - assert ( - layer.n_slices - == len(layer.lora_a_stacked) - == len(layer.lora_b_stacked) - == len(layer.output_slices) - ) + assert (layer.n_slices == len(layer.lora_a_stacked) == len( + layer.lora_b_stacked) == len(layer.output_slices)) if layer.bias_stacked is not None: - assert layer.n_slices==len(layer.bias_stacked) - + assert layer.n_slices == len(layer.bias_stacked) + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - # Since communication is needed, the buffer is directly initialized as a + # Since communication is needed, the buffer is directly initialized as a # tensor rather than a tuple of tensor. buffers = torch.zeros( (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), @@ -72,6 +69,7 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): # now have column partitioned and packed output return output + # these layers are based on the tensor parallelism strategy given in # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, # https://arxiv.org/abs/2311.03285. @@ -100,7 +98,6 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) - @classmethod @_fully_sharded_can_replace @@ -120,6 +117,7 @@ def can_replace_layer( decorate=False, ) + class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 1c63e300b7838..323cc4fbde604 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -376,11 +376,10 @@ def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice(output, x, - self.lora_a_stacked, - self.lora_b_stacked, - self.bias_stacked, 1.0, - self.output_slices) + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, + self.lora_b_stacked, + self.bias_stacked, 1.0, + self.output_slices) return output diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 4b06c05a3e828..9465db16c8892 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -527,7 +527,7 @@ def apply_bias_packed_nslice( return output.view_as(org_output) - def add_shrink( + def apply_shrink( self, y: torch.Tensor, x: torch.Tensor, @@ -549,27 +549,7 @@ def add_shrink( shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def add_shrink_packed_nslice( - self, - y: Union[Tuple[torch.Tensor, ...], torch.Tensor], - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - scale: float, - ): - """ - Perform the ` y[i]+=x@w_t_all[i]` computation, which is suitable for - the GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. 
- Otherwise, it is the decode stage, and the shrink_decode function - should be called. - """ - x = x.view(-1, x.shape[-1]) - # TODO fuse these kernels - for slice_idx in range(len(lora_a_stacked)): - self.add_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale) - - def add_expand( + def apply_expand( self, y: torch.Tensor, x: torch.Tensor, @@ -611,6 +591,37 @@ def add_expand_slice(self, self.expand_slice_decode) expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + def add_shrink_packed_nslice( + self, + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, + ): + """ + Performs GEMM for multiple slices of lora_a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the shrink_decode function + should be called. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + """ + + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self.apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], + scale) + def add_expand_packed_nslice( self, y: torch.Tensor, @@ -621,8 +632,23 @@ def add_expand_packed_nslice( add_input=True, ) -> None: """ - Similar to `add_expand` - """ + Performs GEMM and bias addition for multiple slices of lora_b. + + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + + bias_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight + output_slices (Tuple[int, ...]): Every slice's size + add_input (bool): Defaults to True. + """ y_org = y y = y.view(-1, y.shape[-1]) offset_left = 0 @@ -649,12 +675,17 @@ def add_lora_embedding( add_input: bool = True, ): """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'b or embedding layer's lora. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function - should be called. + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + + Semantics: + y += x @ w_t_all + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + w_t_all (torch.Tensor): Transposed weight matrix for all LoRAs. + add_input (bool): Default to True. 
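            A tiny sketch of what this amounts to for a single adapter
            (illustrative only; the stacked w_t_all and the per-token adapter
            indices are collapsed to one adapter here):

                import torch

                tokens, rank, embed_dim = 5, 8, 32
                lora_a_embeddings = torch.randn(tokens, rank)    # per-token LoRA-A lookup result (x)
                lora_b = torch.randn(embed_dim, rank)            # w_t_all collapsed to one adapter
                base_out = torch.randn(tokens, embed_dim)        # base embedding output (y)

                base_out += lora_a_embeddings @ lora_b.T         # expand only: y += x @ w_t_all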
+ """ # Embedding layer only need expand op @@ -662,67 +693,7 @@ def add_lora_embedding( if self.is_prefill else self.expand_decode) expand_fun(y, x, w_t_all, add_input) - def add_lora(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - scale: float, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None) -> None: - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0)+bias[i] - Args: - y (torch.Tensor): Output tensor. Will be changed in-place. - x (torch.Tensor): Input tensor - wa_t_all (torch.Tensor): lora_a's weight - wb_t_all (torch.Tensor): lora_b's weight - bias_stacked: (torch.Tensor): lora's bias - scale (float): Scaling factor. - y_offset (Optional[int], optional): Offset to apply to the starting - column of y. - y_slice_size (Optional[int], optional): Size of the y column slice. - buffer (Optional[torch.Tensor], optional): Defaults to None. - """ - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) - self.add_shrink(buffer, x, wa_t_all, scale) - if y_offset is None and y_slice_size is None: - self.add_expand(y, - buffer, - wb_t_all, - bias_stacked=None, - add_input=True) - else: - self.apply_expand_slice(y, - buffer, - wb_t_all, - None, - y_offset, - y_slice_size, - add_input=True) - y = y.view_as(y_org) - - def add_lora_packed_nslice( + def add_lora_linear( self, y: torch.Tensor, x: torch.Tensor, @@ -734,9 +705,26 @@ def add_lora_packed_nslice( *, buffer: Optional[Tuple[torch.Tensor, ...]] = None) -> None: """ - Applies lora to each input. Similar to add_lora, This method is - used for layers that are composed of multiple sublayers - (slices) packed together. + Applicable to linear-related lora. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0)+bias_stacked[i] + + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + scale (float): Scaling factor. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. """ assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) @@ -764,25 +752,41 @@ def add_lora_packed_nslice( def add_lora_logits(self, y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, scale, *, buffer: Optional[torch.Tensor] = None) -> None: """ - LogitsProcessorWithLoRA always using bgmv - """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. 
+ x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. + """ y_org = y y = y.view(-1, y.shape[-1]) x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) + r = lora_b_stacked.size(-1) if buffer is None: # We set the buffer to be float32 by default ,refer to: # https://github.com/triton-lang/triton/issues/1387 buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - - bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) - bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) + # LogitsProcessorWithLoRA always using bgmv. + bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, + lora_b_stacked, + y, + self.sampler_indices, + add_inputs=True) y = y.view_as(y_org) From 61a8085941eea5f0598f4ee350e1800158254744 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 10:53:27 +0000 Subject: [PATCH 12/22] Optimize interface Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 6 ++---- vllm/lora/punica.py | 34 ++++++++----------------------- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 038af5d6c3e40..365d1c57c404b 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -302,12 +302,10 @@ def apply(self, # TODO:add DOC buffer = buffer.squeeze(dim=0) - shard_size = self.lora_b_stacked[0].shape[2] - start_idx = self.tp_rank * shard_size - self.punica_wrapper.add_expand_slice( + self.punica_wrapper.add_expand_fs_rowlinear( output, buffer, self.lora_b_stacked[0], self.bias_stacked[0] if self.bias_stacked is not None else None, - start_idx, shard_size) + add_input=True) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 9465db16c8892..8bc6500dd0cbe 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -549,11 +549,11 @@ def apply_shrink( shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def apply_expand( + def add_expand_fs_rowlinear( self, y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, + lora_b_stacked: torch.Tensor, bias_stacked: Optional[torch.Tensor], add_input: bool = True, ): @@ -570,26 +570,9 @@ def apply_expand( expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) - expand_fun(y, x, w_t_all, add_input) - - def add_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): - """ - Similar to `add_expand` - """ - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) + expand_fun(y, x, lora_b_stacked, add_input) + - expand_slice_fun: Callable = (self.expand_slice_prefill - if self.is_prefill else - self.expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) def add_shrink_packed_nslice( self, @@ -664,26 +647,25 @@ def add_expand_packed_nslice( output_slices[slice_idx], add_input=add_input) offset_left += output_slices[slice_idx] - y = y.view_as(y_org) def add_lora_embedding( self, y: torch.Tensor, x: torch.Tensor, - w_t_all: torch.Tensor, + lora_b_stacked: torch.Tensor, add_input: bool = True, ): """ Applies lora specifically for VocabParallelEmbeddingWithLoRA. 
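
For the logits path just above, the whole operation is a single shrink/expand pair gathered by sampler_indices rather than token_lora_indices. A reference-only sketch under assumed shapes (the real method always goes through the bgmv kernels and omits nothing for padding; the "-1 = no adapter" case is left out here for brevity):

import torch

def lora_logits_reference(y, x, lora_a_stacked, lora_b_stacked,
                          sampler_indices, scale):
    # Assumed shapes: x (num_seqs, hidden), lora_a_stacked (num_loras, r, hidden),
    # lora_b_stacked (num_loras, vocab, r), sampler_indices (num_seqs,)
    a = lora_a_stacked[sampler_indices]                    # per-sequence A
    b = lora_b_stacked[sampler_indices]                    # per-sequence B
    # float32 intermediate, mirroring the wrapper's default buffer dtype
    buf = torch.einsum("th,trh->tr", x.float(), a.float()) * scale
    y += torch.einsum("tr,tvr->tv", buf, b.float()).to(y.dtype)
    return y
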
Semantics: - y += x @ w_t_all + y += x @ lora_b_stacked Args: y (torch.Tensor): Output tensor. x (torch.Tensor): Input tensor. - w_t_all (torch.Tensor): Transposed weight matrix for all LoRAs. + lora_b_stacked (torch.Tensor): lora_b's weights. add_input (bool): Default to True. """ @@ -691,7 +673,7 @@ def add_lora_embedding( # Embedding layer only need expand op expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) - expand_fun(y, x, w_t_all, add_input) + expand_fun(y, x, lora_b_stacked, add_input) def add_lora_linear( self, From 4ab1c33ce4b90e01603a16da6aa74f58d8f2e7be Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 15:48:15 +0000 Subject: [PATCH 13/22] Add unit test Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 27 ++++++++++++++++++++------- vllm/lora/fully_sharded_layers.py | 23 +++++++++++------------ vllm/lora/layers.py | 3 ++- vllm/lora/punica.py | 20 +++++++++----------- 4 files changed, 42 insertions(+), 31 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 15e576cb065c7..c1de857821e23 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -669,8 +669,9 @@ def create_random_linear_replicated_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -679,7 +680,8 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_parallel_layer(): if orientation == "row": @@ -700,7 +702,12 @@ def create_random_linear_parallel_layer(): if not fully_shard else ColumnParallelLinearWithShardedLoRA(linear)) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked)) + if bias_enabled: + assert len(lora_linear.bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.bias_stacked is None return linear, lora_linear for i in range(10): @@ -784,8 +791,9 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -794,7 +802,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_column_parallel_packed_layer(): if repeats == 2: @@ -835,7 +844,12 @@ class FakeConfig: lora_linear.create_lora_weights(max_loras, lora_config, model_config=FakeConfig()) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked)) + if bias_enabled: + assert 
len(lora_linear.bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.bias_stacked is None return linear, lora_linear for i in range(10): @@ -911,7 +925,6 @@ class FakeConfig: 512, lora_config.lora_extra_vocab_size, ) - # lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 365d1c57c404b..0d13914d99136 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -55,15 +55,14 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): device=x.device, ) - layer.punica_wrapper.add_shrink_packed_nslice(buffers, x, - layer.lora_a_stacked, 1.0) + layer.punica_wrapper.add_shrink(buffers, x, layer.lora_a_stacked, 1.0) buffers = tensor_model_parallel_all_gather(buffers) - layer.punica_wrapper.add_expand_packed_nslice(output, - buffers, - layer.lora_b_stacked, - layer.bias_stacked, - layer.output_slices, - add_input=True) + layer.punica_wrapper.add_expand(output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + layer.output_slices, + add_input=True) output = output.view(*out_orig_shape) # now have column partitioned and packed output @@ -289,8 +288,7 @@ def apply(self, device=x.device, ) - self.punica_wrapper.add_shrink_packed_nslice(buffer, x, - self.lora_a_stacked, 1.0) + self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) buffer = tensor_model_parallel_all_reduce(buffer) # following S-LoRA, allows the fusing of all_gather and all_reduce @@ -300,10 +298,11 @@ def apply(self, # the output is not the same as a normal row_parallel, it should be # reduced before being used - # TODO:add DOC buffer = buffer.squeeze(dim=0) self.punica_wrapper.add_expand_fs_rowlinear( - output, buffer, self.lora_b_stacked[0], + output, + buffer, + self.lora_b_stacked[0], self.bias_stacked[0] if self.bias_stacked is not None else None, add_input=True) output = output.view(*out_orig_shape) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 323cc4fbde604..c1f03675ff4f0 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -278,6 +278,8 @@ def __init__(self, base_layer: LinearBase): self.output_slices: Tuple[int, ...] self.tp_size: int + self.output_size: int + self.n_slices: int def create_lora_weights( self, @@ -389,7 +391,6 @@ def __init__(self, base_layer: ReplicatedLinear) -> None: super().__init__(base_layer, ) # To ensure interface compatibility, set to 1 always. 
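
For context on the add_shrink/add_expand calls in _mcp_apply above: lora_a is sharded along the rank dimension, so each rank shrinks locally, the low-rank buffer is all-gathered, and each rank then expands only the output columns it owns. A single-adapter, single-slice sketch under those assumptions (the collective is passed in so the example runs on one process):

import torch

def sharded_column_lora_reference(x, lora_a_shard, lora_b_shard,
                                  all_gather, scale=1.0):
    # lora_a_shard: (r // tp_size, in_dim)  -- rank dimension sharded
    # lora_b_shard: (out_dim // tp_size, r) -- output columns sharded
    partial = (x @ lora_a_shard.t()) * scale   # local shrink
    full = all_gather(partial)                 # (tokens, r) across ranks
    return full @ lora_b_shard.t()             # local expand for owned columns

# Single-process stand-in for the collective (tp_size == 1):
# out = sharded_column_lora_reference(x, a_shard, b_shard, all_gather=lambda t: t)
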
self.tp_size = 1 - self.output_size = self.base_layer.output_size self.n_slices = 1 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 8bc6500dd0cbe..fc79e957c6cc6 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -572,9 +572,7 @@ def add_expand_fs_rowlinear( if self.is_prefill else self.expand_decode) expand_fun(y, x, lora_b_stacked, add_input) - - - def add_shrink_packed_nslice( + def add_shrink( self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], x: torch.Tensor, @@ -605,7 +603,7 @@ def add_shrink_packed_nslice( self.apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], scale) - def add_expand_packed_nslice( + def add_expand( self, y: torch.Tensor, x: Union[Tuple[torch.Tensor, ...], torch.Tensor], @@ -723,13 +721,13 @@ def add_lora_linear( torch.zeros( (x.size(0), r), dtype=torch.float32, device=x.device) for _ in range(len(output_slices))) - self.add_shrink_packed_nslice(buffer, x, lora_a_stacked, scale) - self.add_expand_packed_nslice(y, - buffer, - lora_b_stacked, - None, - output_slices, - add_input=True) + self.add_shrink(buffer, x, lora_a_stacked, scale) + self.add_expand(y, + buffer, + lora_b_stacked, + None, + output_slices, + add_input=True) def add_lora_logits(self, y: torch.Tensor, From 3c2192cb966cc4fd3ea7580edab84458736c94d7 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 4 Dec 2024 16:30:44 +0000 Subject: [PATCH 14/22] Done Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 14 ++++- vllm/lora/fully_sharded_layers.py | 17 ++++-- vllm/lora/layers.py | 5 +- vllm/lora/punica.py | 92 ++++++++----------------------- 4 files changed, 47 insertions(+), 81 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index c1de857821e23..4e4988b07564a 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -565,7 +565,9 @@ def _pretest(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) -def test_linear_replicated(dist_init, num_loras, device, stage) -> None: +@pytest.mark.parametrize("bias_enabled", [True, False]) +def test_linear_replicated(dist_init, num_loras, device, stage, + bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -573,7 +575,8 @@ def test_linear_replicated(dist_init, num_loras, device, stage) -> None: max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_replicated_layer(): @@ -585,7 +588,12 @@ def create_random_linear_replicated_layer(): lora_linear = ReplicatedLinearWithLoRA(linear) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked)) + if bias_enabled: + assert len(lora_linear.bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.bias_stacked is None return linear, lora_linear for i in range(10): diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 0d13914d99136..a0e2dd3d8e5cf 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -62,6 +62,7 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): layer.lora_b_stacked, layer.bias_stacked, layer.output_slices, + offset_start=0, add_input=True) output = output.view(*out_orig_shape) @@ -297,14 +298,18 @@ def apply(self, # remains is a standard all_reduce. 
User should be aware though that # the output is not the same as a normal row_parallel, it should be # reduced before being used - - buffer = buffer.squeeze(dim=0) - self.punica_wrapper.add_expand_fs_rowlinear( + # NOTE offset are based on the rank. + shard_size = self.lora_b_stacked[0].shape[2] + offset_start = self.tp_rank * shard_size + self.punica_wrapper.add_expand( output, buffer, - self.lora_b_stacked[0], - self.bias_stacked[0] if self.bias_stacked is not None else None, - add_input=True) + self.lora_b_stacked, + self.bias_stacked, + self.output_slices, + offset_start=offset_start, + add_input=True, + ) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index c1f03675ff4f0..a6c2c5bd012cb 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -288,7 +288,7 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - + # if isinstance(self.base_layer, ReplicatedLinear): lora_a_out_size = lora_config.max_lora_rank lora_b_out_size = self.output_size @@ -307,7 +307,6 @@ def create_lora_weights( else: raise NotImplementedError - lora_bias_out_size = self.output_size self.lora_a_stacked = tuple( torch.zeros( max_loras, @@ -327,6 +326,7 @@ def create_lora_weights( device=self.device, ) for _ in range(self.n_slices)) if lora_config.bias_enabled: + lora_bias_out_size = lora_b_out_size self.bias_stacked = tuple( torch.zeros( max_loras, @@ -342,6 +342,7 @@ def reset_lora(self, index: int): self.lora_a_stacked[s_index][index] = 0 self.lora_b_stacked[s_index][index] = 0 if self.lora_config.bias_enabled: + # Make mypy happy self.bias_stacked = cast(Tuple[torch.Tensor, ...], self.bias_stacked) self.bias_stacked[s_index][index] = 0 diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index fc79e957c6cc6..1f503b763d614 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -450,21 +450,18 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) - def apply_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): + def apply_expand(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): """ - Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all+bias` + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` computation, which is suitable for the GEMM of lora'b. 
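
The offset_start change above implements the S-LoRA fusion referenced in the comment: each rank expands with its column shard of lora_b and writes into its own slice of the not-yet-reduced output, so the base layer's later all_reduce doubles as the all_gather of the LoRA output. A sketch of that indexing (the shard layout is an assumption for illustration):

import torch

def rowparallel_lora_expand_reference(output, buffer, lora_b_shard, tp_rank):
    # buffer: (tokens, r) low-rank activations, already all_reduced
    # lora_b_shard: (out_dim // tp_size, r) -- this rank's column shard of lora_b
    shard_size = lora_b_shard.shape[0]
    offset_start = tp_rank * shard_size
    output[:, offset_start:offset_start + shard_size] += buffer @ lora_b_shard.t()
    return output  # per-rank slices are summed by the subsequent all_reduce
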
""" - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else @@ -472,30 +469,6 @@ def apply_expand_slice(self, expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) def apply_bias( - self, - indices: torch.Tensor, - output: torch.Tensor, - bias_stacked: torch.Tensor, - ): - """Applies bias to output - - Input shapes: - bias_stacked: (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) - bias_stacked = bias_stacked[indices] - bias_stacked[indices == -1] = 0 - output += bias_stacked - - return output.view_as(org_output) - - def apply_bias_packed_nslice( self, indices: torch.Tensor, output: torch.Tensor, @@ -549,29 +522,6 @@ def apply_shrink( shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) - def add_expand_fs_rowlinear( - self, - y: torch.Tensor, - x: torch.Tensor, - lora_b_stacked: torch.Tensor, - bias_stacked: Optional[torch.Tensor], - add_input: bool = True, - ): - """ - Perform the ` y+=x@w_t_all+bias` computation, which is suitable for the - GEMM of lora'b. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function - should be called. - """ - if bias_stacked is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_stacked) - - expand_fun: Callable = (self.expand_prefill - if self.is_prefill else self.expand_decode) - expand_fun(y, x, lora_b_stacked, add_input) - def add_shrink( self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], @@ -610,6 +560,7 @@ def add_expand( lora_b_stacked: Tuple[torch.Tensor, ...], bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], + offset_start: int = 0, add_input=True, ) -> None: """ @@ -632,18 +583,19 @@ def add_expand( """ y_org = y y = y.view(-1, y.shape[-1]) - offset_left = 0 + offset_left = offset_start if bias_stacked is not None: - self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_stacked) + self.apply_bias(self.token_lora_indices, y, output_slices, + bias_stacked) for slice_idx in range(len(lora_b_stacked)): - self.apply_expand_slice(y, - x[slice_idx], - lora_b_stacked[slice_idx], - None, - offset_left, - output_slices[slice_idx], - add_input=add_input) + self.apply_expand( + y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_input=add_input, + ) offset_left += output_slices[slice_idx] y = y.view_as(y_org) @@ -710,8 +662,8 @@ def add_lora_linear( assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) if bias_stacked is not None: assert len(bias_stacked) == len(output_slices) - y = self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_stacked) + y = self.apply_bias(self.token_lora_indices, y, output_slices, + bias_stacked) if buffer is None: r = lora_b_stacked[0].size(-1) From bb60e25733674a0e59d3cbcd31bb98ca81ef8dc2 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 03:51:09 +0000 Subject: [PATCH 15/22] Optimize function name Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 12 ++-- vllm/lora/fully_sharded_layers.py | 13 ++-- vllm/lora/layers.py | 113 +++++++++++++++++------------- vllm/lora/punica.py 
| 29 ++++---- 4 files changed, 94 insertions(+), 73 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 4e4988b07564a..06192e9678ba7 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -591,9 +591,9 @@ def create_random_linear_replicated_layer(): assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked)) if bias_enabled: - assert len(lora_linear.bias_stacked) == lora_linear.n_slices + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: - assert lora_linear.bias_stacked is None + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -713,9 +713,9 @@ def create_random_linear_parallel_layer(): assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked)) if bias_enabled: - assert len(lora_linear.bias_stacked) == lora_linear.n_slices + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: - assert lora_linear.bias_stacked is None + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -855,9 +855,9 @@ class FakeConfig: assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked)) if bias_enabled: - assert len(lora_linear.bias_stacked) == lora_linear.n_slices + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: - assert lora_linear.bias_stacked is None + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index a0e2dd3d8e5cf..545ec21ca74c1 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -39,8 +39,8 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): """ assert (layer.n_slices == len(layer.lora_a_stacked) == len( layer.lora_b_stacked) == len(layer.output_slices)) - if layer.bias_stacked is not None: - assert layer.n_slices == len(layer.bias_stacked) + if layer.lora_bias_stacked is not None: + assert layer.n_slices == len(layer.lora_bias_stacked) output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) @@ -60,7 +60,7 @@ def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): layer.punica_wrapper.add_expand(output, buffers, layer.lora_b_stacked, - layer.bias_stacked, + layer.lora_bias_stacked, layer.output_slices, offset_start=0, add_input=True) @@ -268,8 +268,9 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - self.bias_stacked = cast(Tuple[torch.Tensor, ...], self.bias_stacked) - shard_size = self.bias_stacked[0].shape[2] + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + shard_size = self.lora_bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size bias = bias[start_idx:end_idx] @@ -305,7 +306,7 @@ def apply(self, output, buffer, self.lora_b_stacked, - self.bias_stacked, + self.lora_bias_stacked, self.output_slices, offset_start=offset_start, add_input=True, diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a6c2c5bd012cb..3ff96832eb5a5 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -274,7 +274,7 @@ def __init__(self, base_layer: LinearBase): self.base_layer = base_layer self.input_size = self.base_layer.input_size self.device = _get_lora_device(self.base_layer) - 
self.bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None self.output_slices: Tuple[int, ...] self.tp_size: int @@ -327,7 +327,7 @@ def create_lora_weights( ) for _ in range(self.n_slices)) if lora_config.bias_enabled: lora_bias_out_size = lora_b_out_size - self.bias_stacked = tuple( + self.lora_bias_stacked = tuple( torch.zeros( max_loras, 1, @@ -343,9 +343,9 @@ def reset_lora(self, index: int): self.lora_b_stacked[s_index][index] = 0 if self.lora_config.bias_enabled: # Make mypy happy - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[s_index][index] = 0 + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[s_index][index] = 0 def set_lora( self, @@ -353,14 +353,20 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): + # Except for QKVParallelLinearWithLora and + # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers + # have a tuple size of 1. These two layers will override this function. + assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == + self.n_slices == 1) + self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) self.lora_a_stacked[0][index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -368,12 +374,13 @@ def set_lora( self.lora_b_stacked[0][index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[0][index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) + if lora_bias is not None: + + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( + lora_bias.T, non_blocking=True) def apply(self, x: torch.Tensor, @@ -381,7 +388,7 @@ def apply(self, output = self.base_layer.quant_method.apply(self.base_layer, x, bias) self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, self.lora_b_stacked, - self.bias_stacked, 1.0, + self.lora_bias_stacked, 1.0, self.output_slices) return output @@ -543,6 +550,10 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: + """ + The main reason for overriding this function is to enhance code + maintainability. 
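
As a standalone illustration of the buffer layout the renamed lora_bias_stacked follows, here is a sketch with made-up sizes, assuming lora_a arrives as (in_dim, rank) and lora_b as (rank, out_dim), which matches the transposed copies in the diff; dtype and device handling are omitted.

import torch

max_loras, max_rank, in_dim, out_dim = 8, 16, 1024, 1024   # illustrative sizes
lora_a_stacked = torch.zeros(max_loras, 1, max_rank, in_dim)
lora_b_stacked = torch.zeros(max_loras, 1, out_dim, max_rank)
lora_bias_stacked = torch.zeros(max_loras, 1, out_dim)

def set_lora_slot(index, lora_a, lora_b, lora_bias=None):
    # Copy one adapter (transposed) into slot `index`; adapters with a smaller
    # rank stay zero-padded, so the unused rows and columns contribute nothing.
    lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]] = lora_a.t()
    lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]] = lora_b.t()
    if lora_bias is not None:
        lora_bias_stacked[index, 0, :lora_bias.shape[0]] = lora_bias
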
+ """ self.lora_config = lora_config if not (len(self.base_layer.output_sizes) == self.n_slices @@ -577,7 +588,7 @@ def create_lora_weights( device=self.device, ) for _ in range(self.n_slices)) if lora_config.bias_enabled: - self.bias_stacked = tuple( + self.lora_bias_stacked = tuple( torch.zeros( max_loras, 1, @@ -625,15 +636,15 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) if lora_a[0] is not None: self.lora_a_stacked[0][ @@ -642,12 +653,11 @@ def set_lora( self.lora_b_stacked[0][ index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) - if bias is not None and bias[0] is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[0][index, - 0, :bias[0].shape[0]].copy_(bias[0].T, - non_blocking=True) + if lora_bias is not None and lora_bias[0] is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias[0].shape[0]].copy_( + lora_bias[0].T, non_blocking=True) if lora_a[1] is not None: self.lora_a_stacked[1][ index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( @@ -655,12 +665,11 @@ def set_lora( self.lora_b_stacked[1][ index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) - if bias is not None and bias[1] is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - self.bias_stacked[1][index, - 0, :bias[1].shape[0]].copy_(bias[1].T, - non_blocking=True) + if lora_bias is not None and lora_bias[1] is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[1][index, 0, :lora_bias[1].shape[0]].copy_( + lora_bias[1].T, non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -765,6 +774,10 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: + """ + The main reason for overloading this function is to handle inconsistent + weight dimensions in qkv lora. 
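
Concretely, the qkv path allocates three differently sized buffers because, with grouped-query attention, the packed output is q_size + 2 * kv_size and kv_size generally differs from q_size. A small sketch with made-up per-rank widths:

q_size, kv_size = 4096, 1024          # assumed per-rank projection widths
output_slices = (q_size, kv_size, kv_size)

def split_qkv(out, output_slices):
    # How a packed qkv output row decomposes into its three slices.
    q_size, kv_size, _ = output_slices
    q = out[:, :q_size]
    k = out[:, q_size:q_size + kv_size]
    v = out[:, q_size + kv_size:]
    return q, k, v
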
+ """ self.lora_config = lora_config self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() @@ -832,7 +845,7 @@ def create_lora_weights( ), ) if lora_config.bias_enabled: - self.bias_stacked = ( + self.lora_bias_stacked = ( torch.zeros( max_loras, 1, @@ -915,15 +928,15 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) if lora_b[0] is not None: lora_b_q = lora_b[0] @@ -954,18 +967,24 @@ def set_lora( index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( lora_a[2].T, non_blocking=True) - if bias is not None: - self.bias_stacked = cast(Tuple[torch.Tensor, ...], - self.bias_stacked) - if bias[0] is not None: - self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( - bias[0].T, non_blocking=True) - if bias[1] is not None: - self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( - bias[1].T, non_blocking=True) - if bias[2] is not None: - self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( - bias[2].T, non_blocking=True) + if lora_bias is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + if lora_bias[0] is not None: + self.lora_bias_stacked[0][index, + 0, :lora_bias[0].shape[0]].copy_( + lora_bias[0].T, + non_blocking=True) + if lora_bias[1] is not None: + self.lora_bias_stacked[1][index, + 0, :lora_bias[1].shape[0]].copy_( + lora_bias[1].T, + non_blocking=True) + if lora_bias[2] is not None: + self.lora_bias_stacked[2][index, + 0, :lora_bias[2].shape[0]].copy_( + lora_bias[2].T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 1f503b763d614..257890a1853b9 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -473,12 +473,12 @@ def apply_bias( indices: torch.Tensor, output: torch.Tensor, output_slices: Tuple[int, ...], - bias_stacked: Tuple[Optional[torch.Tensor], ...], + lora_bias_stacked: Tuple[Optional[torch.Tensor], ...], ): """Applies bias to output Input shapes: - bias_stacked: 3 element tuple of (num_loras, output_dim) + lora_bias_stacked: 3 element tuple of (num_loras, output_dim) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) output_slices: n-1 element tuple of (slice_size...), @@ -490,7 +490,7 @@ def apply_bias( offset_left = 0 for slice_idx, slice in enumerate(output_slices): - bias = bias_stacked[slice_idx] + bias = lora_bias_stacked[slice_idx] if bias is not None: bias = bias.view(-1, bias.shape[-1]) bias = bias[indices] @@ -558,7 +558,7 @@ def add_expand( y: torch.Tensor, x: Union[Tuple[torch.Tensor, ...], torch.Tensor], lora_b_stacked: Tuple[torch.Tensor, ...], - bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], output_slices: Tuple[int, ...], offset_start: int = 0, add_input=True, @@ -570,23 +570,24 @@ def add_expand( for i in range(len(lora_b_stacked)): slice = output_slices[i] y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + - bias_stacked[i] + lora_bias_stacked[i] offset += slice Args: y (torch.Tensor): Output tensor. 
x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight - bias_stacked (Optional[Tuple[torch.Tensor, ...]]): bias's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + bias's weight output_slices (Tuple[int, ...]): Every slice's size add_input (bool): Defaults to True. """ y_org = y y = y.view(-1, y.shape[-1]) offset_left = offset_start - if bias_stacked is not None: + if lora_bias_stacked is not None: self.apply_bias(self.token_lora_indices, y, output_slices, - bias_stacked) + lora_bias_stacked) for slice_idx in range(len(lora_b_stacked)): self.apply_expand( y, @@ -631,7 +632,7 @@ def add_lora_linear( x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], lora_b_stacked: Tuple[torch.Tensor, ...], - bias_stacked: Optional[Tuple[torch.Tensor, ...]], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], scale: float, output_slices: Tuple[int, ...], *, @@ -646,24 +647,24 @@ def add_lora_linear( @ lora_a_stacked[indices[i], layer_idx, :, :] @ lora_b_stacked[indices[i], layer_idx, :, :] * scale - ).squeeze(0)+bias_stacked[i] + ).squeeze(0)+lora_bias_stacked[i] Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. - bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. output_slices (Tuple[int, ...]): Every slice's size. buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. """ assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) - if bias_stacked is not None: - assert len(bias_stacked) == len(output_slices) + if lora_bias_stacked is not None: + assert len(lora_bias_stacked) == len(output_slices) y = self.apply_bias(self.token_lora_indices, y, output_slices, - bias_stacked) + lora_bias_stacked) if buffer is None: r = lora_b_stacked[0].size(-1) From b61da95a154e52cdf98b364b9bdbf34c3a76790c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 05:24:57 +0000 Subject: [PATCH 16/22] Optimize doc Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3ff96832eb5a5..ed751bf2635d3 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -357,7 +357,8 @@ def set_lora( ): # Except for QKVParallelLinearWithLora and # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers - # have a tuple size of 1. These two layers will override this function. + # store weights in a tuple of size 1. These two layers will + # override this function. 
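
A reference for the per-token bias path that add_lora_linear now guards with the lora_bias_stacked assert. Shapes are assumptions and the real apply_bias works on flattened views; this only mirrors the gather-by-index, mask -1, add-per-slice semantics.

import torch

def apply_lora_bias_reference(output, lora_bias_stacked, output_slices,
                              token_lora_indices):
    # lora_bias_stacked[i]: (num_loras, out_slice_i); indices: (num_tokens,)
    offset = 0
    for bias, size in zip(lora_bias_stacked, output_slices):
        if bias is not None:
            per_token = bias[token_lora_indices.clamp(min=0)].clone()
            per_token[token_lora_indices == -1] = 0   # no adapter, no bias
            output[:, offset:offset + size] += per_token
        offset += size
    return output
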
assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == self.n_slices == 1) From 0a6b01c34abafdc139e2655f751f1a827cc4daa8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:30:10 +0000 Subject: [PATCH 17/22] format code Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ed751bf2635d3..a466e3a732aa7 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -18,12 +18,9 @@ tensor_model_parallel_gather) from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearBase, - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, LinearBase, MergedColumnParallelLinear, + QKVParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) From e440859f40551bcf7fee1413e52684370efefd23 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:38:10 +0000 Subject: [PATCH 18/22] format code Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index a466e3a732aa7..513af27973df9 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -18,9 +18,14 @@ tensor_model_parallel_gather) from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper -from vllm.model_executor.layers.linear import ( - ColumnParallelLinear, LinearBase, MergedColumnParallelLinear, - QKVParallelLinear, ReplicatedLinear, RowParallelLinear) +# yapf: enable +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearBase, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) From 7b01f48bab447cdff402944f731e7a3b1aa84df6 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:41:38 +0000 Subject: [PATCH 19/22] fix typo Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 513af27973df9..c32d69162dd75 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -18,7 +18,7 @@ tensor_model_parallel_gather) from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper -# yapf: enable +# yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, LinearBase, MergedColumnParallelLinear, From da2256df0f3802b5878f41998a24c7954505bc72 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 06:48:52 +0000 Subject: [PATCH 20/22] fix typo Signed-off-by: Jee Jee Li --- vllm/lora/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index c32d69162dd75..ec179eca5f91e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -25,7 +25,7 @@ QKVParallelLinear, ReplicatedLinear, RowParallelLinear) -# yapf: enable +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from 
vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) From a265f7a8fa17aeb8e90a6318714c33f602c7f7ec Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 08:00:46 +0000 Subject: [PATCH 21/22] Modify function name Signed-off-by: Jee Jee Li --- vllm/lora/punica.py | 64 ++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 257890a1853b9..563d1181d6fcb 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -362,7 +362,7 @@ def long_lora_indices(self) -> torch.Tensor: long_lora_len = self.indices_len[4] return self._long_lora_indices[:long_lora_len] - def shrink_prefill( + def _shrink_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -380,7 +380,7 @@ def shrink_prefill( scale, ) - def shrink_decode( + def _shrink_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -389,7 +389,7 @@ def shrink_decode( ): bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) - def expand_prefill( + def _expand_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -407,7 +407,7 @@ def expand_prefill( add_input, ) - def expand_decode( + def _expand_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -416,7 +416,7 @@ def expand_decode( ): bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) - def expand_slice_prefill( + def _expand_slice_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -438,7 +438,7 @@ def expand_slice_prefill( add_input, ) - def expand_slice_decode( + def _expand_slice_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -450,25 +450,25 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) - def apply_expand(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): + def _apply_expand(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): """ Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` computation, which is suitable for the GEMM of lora'b. """ - expand_slice_fun: Callable = (self.expand_slice_prefill + expand_slice_fun: Callable = (self._expand_slice_prefill if self.is_prefill else - self.expand_slice_decode) + self._expand_slice_decode) expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) - def apply_bias( + def _apply_bias( self, indices: torch.Tensor, output: torch.Tensor, @@ -500,7 +500,7 @@ def apply_bias( return output.view_as(org_output) - def apply_shrink( + def _apply_shrink( self, y: torch.Tensor, x: torch.Tensor, @@ -511,14 +511,14 @@ def apply_shrink( Perform the ` y+=x@w_t_all` computation, which is suitable for the GEMM of lora'a. When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the shrink_decode function + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function should be called. 
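
After this rename the split is: add_shrink, add_expand, add_lora_embedding, add_lora_linear and add_lora_logits form the public surface, while the underscore-prefixed helpers only pick between the prefill and decode kernels. A condensed view of the public signatures as they appear in this series (summary only, self and keyword-only markers elided):

# add_shrink(y, x, lora_a_stacked, scale)
# add_expand(y, x, lora_b_stacked, lora_bias_stacked, output_slices,
#            offset_start=0, add_input=True)
# add_lora_embedding(y, x, lora_b_stacked, add_input=True)
# add_lora_linear(y, x, lora_a_stacked, lora_b_stacked, lora_bias_stacked,
#                 scale, output_slices, buffer=None)
# add_lora_logits(y, x, lora_a_stacked, lora_b_stacked, scale, buffer=None)
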
""" y_org = y y = y.view(-1, y.shape[-1]) - shrink_fun: Callable = (self.shrink_prefill - if self.is_prefill else self.shrink_decode) + shrink_fun: Callable = (self._shrink_prefill + if self.is_prefill else self._shrink_decode) shrink_fun(y, x, w_t_all, scale) y = y.view_as(y_org) @@ -532,8 +532,8 @@ def add_shrink( """ Performs GEMM for multiple slices of lora_a. When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the shrink_decode function + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function should be called. Semantics: @@ -550,8 +550,8 @@ def add_shrink( x = x.view(-1, x.shape[-1]) # TODO fuse these kernels for slice_idx in range(len(lora_a_stacked)): - self.apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], - scale) + self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], + scale) def add_expand( self, @@ -586,10 +586,10 @@ def add_expand( y = y.view(-1, y.shape[-1]) offset_left = offset_start if lora_bias_stacked is not None: - self.apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) + self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) for slice_idx in range(len(lora_b_stacked)): - self.apply_expand( + self._apply_expand( y, x[slice_idx], lora_b_stacked[slice_idx], @@ -622,8 +622,8 @@ def add_lora_embedding( """ # Embedding layer only need expand op - expand_fun: Callable = (self.expand_prefill - if self.is_prefill else self.expand_decode) + expand_fun: Callable = (self._expand_prefill + if self.is_prefill else self._expand_decode) expand_fun(y, x, lora_b_stacked, add_input) def add_lora_linear( @@ -663,8 +663,8 @@ def add_lora_linear( assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) if lora_bias_stacked is not None: assert len(lora_bias_stacked) == len(output_slices) - y = self.apply_bias(self.token_lora_indices, y, output_slices, - lora_bias_stacked) + y = self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) if buffer is None: r = lora_b_stacked[0].size(-1) From 2f02dda5aa1b6156f3eeedf01a4246a8225631a6 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 09:03:29 +0000 Subject: [PATCH 22/22] Modify nslices Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 7 ++++--- vllm/lora/layers.py | 15 ++++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 06192e9678ba7..a113e3f7abc1e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -589,7 +589,7 @@ def create_random_linear_replicated_layer(): lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked)) + lora_linear.lora_b_stacked) == 1) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: @@ -711,7 +711,7 @@ def create_random_linear_parallel_layer(): ColumnParallelLinearWithShardedLoRA(linear)) lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked)) + lora_linear.lora_b_stacked) == 1) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: @@ -849,11 +849,12 @@ class FakeConfig: num_key_value_heads = 32 num_attention_heads = 32 + 
n_slices = repeats lora_linear.create_lora_weights(max_loras, lora_config, model_config=FakeConfig()) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( - lora_linear.lora_b_stacked)) + lora_linear.lora_b_stacked) == n_slices) if bias_enabled: assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices else: diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index ec179eca5f91e..473e4bedf3d60 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -545,7 +545,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) # There are two LoRA layers - self.n_slices = 2 + self.n_slices = len(self.base_layer.output_sizes) def create_lora_weights( self, @@ -559,7 +559,7 @@ def create_lora_weights( """ self.lora_config = lora_config - if not (len(self.base_layer.output_sizes) == self.n_slices + if not (len(self.base_layer.output_sizes) == self.n_slices == 2 and self.base_layer.output_sizes[0] == self.base_layer.output_sizes[1]): raise ValueError( @@ -769,7 +769,9 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) # There are three LoRA layer. - self.n_slices = 3 + self.n_slices = len(self.base_layer.output_sizes) + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() def create_lora_weights( self, @@ -782,8 +784,11 @@ def create_lora_weights( weight dimensions in qkv lora. """ self.lora_config = lora_config - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() + + if not (len(self.base_layer.output_sizes) == self.n_slices == 3): + raise ValueError( + "LoRAColumnParallelLinear3Slice requires 3 slices.") + self.q_proj_shard_size = (self.base_layer.num_heads * self.base_layer.head_size) self.kv_proj_shard_size = (self.base_layer.num_kv_heads *