diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py
index 89f0d0406d..f3359ca9a8 100644
--- a/src/peft/tuners/lora/layer.py
+++ b/src/peft/tuners/lora/layer.py
@@ -221,7 +221,7 @@ def pissa_init(self, adapter_name, init_lora_weights):
                 "Please initialize PiSSA under float32, float16, or bfloat16. "
                 "Subsequently, re-quantize the residual model to help minimize quantization errors."
             )
-        weight = weight.to(torch.float32)
+        weight = transpose(weight.to(torch.float32), self.fan_in_fan_out)
         if init_lora_weights == "pissa":
             # USV^T = W <-> VSU^T = W^T, where W^T = weight.data in R^{out_channel, in_channel},
             V, S, Uh = torch.linalg.svd(weight.data, full_matrices=False)
@@ -245,7 +245,7 @@ def pissa_init(self, adapter_name, init_lora_weights):
         self.lora_A[adapter_name].weight.data = lora_A
         self.lora_B[adapter_name].weight.data = lora_B
         weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A
-        weight = weight.to(dtype)
+        weight = transpose(weight.to(dtype), self.fan_in_fan_out)
         self.get_base_layer().weight.data = weight
 
     def loftq_init(self, adapter_name):
diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py
index aa3a6e5005..354ef0ff12 100644
--- a/tests/test_gpu_examples.py
+++ b/tests/test_gpu_examples.py
@@ -46,6 +46,7 @@
     WhisperProcessor,
     WhisperTokenizer,
 )
+from transformers.pytorch_utils import Conv1D
 
 from peft import (
     AdaLoraConfig,
@@ -1718,7 +1719,7 @@ def quantize_model(self, model, num_bits=4, device="cuda"):
         # Quantize the `weight.data` of the linear layer in the model to `num_bits` and store it with full precision.
         quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64)
         for name, module in model.named_modules():
-            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
+            if isinstance(module, (torch.nn.Linear, Conv1D)) and "lm_head" not in name:
                 quantized_weight, max_abs, shape = quantizer.quantize_block(module.weight.data.to(device))
                 module.weight.data = quantizer.dequantize_block(quantized_weight, max_abs, shape)
         return model
@@ -1727,7 +1728,7 @@ def nuclear_norm(self, base_model, quantized_model):
         # Calculate the nuclear norm (sum of singular values) of the error matrices between the `quantized_model` and the `base_model`.
         error_list = []
         for name, module in base_model.named_modules():
-            if isinstance(module, torch.nn.Linear) and "lm_head" not in name:
+            if isinstance(module, (torch.nn.Linear, Conv1D)) and "lm_head" not in name:
                 quant_module = quantized_model.get_submodule(name)
                 error_list.append(torch.linalg.svdvals(module.weight.data - quant_module.weight.data).sum())
         return torch.Tensor(error_list).sum()
@@ -1821,6 +1822,16 @@ def test_t5_pissa_4bit(self, device, tmp_path):
     def test_t5_pissa_8bit(self, device, tmp_path):
         self.get_errors(bits=8, device=device, model_id="t5-small", tmp_path=tmp_path)
 
+    @pytest.mark.parametrize("device", ["cuda", "cpu"])
+    def test_gpt2_pissa_4bit(self, device, tmp_path):
+        # see 2104
+        self.get_errors(bits=4, device=device, model_id="gpt2", tmp_path=tmp_path)
+
+    @pytest.mark.parametrize("device", ["cuda", "cpu"])
+    def test_gpt2_pissa_8bit(self, device, tmp_path):
+        # see 2104
+        self.get_errors(bits=8, device=device, model_id="gpt2", tmp_path=tmp_path)
+
     @require_bitsandbytes
     def test_lora_pissa_conversion_same_output_after_loading_with_quantization(self, tmp_path):
         # A copy of the test `test_lora_pissa_conversion_same_output_after_loading` in peft/tests/test_initialization.py,
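The root cause: PiSSA's SVD-based initialization assumes `weight.data` is laid out as `(out_features, in_features)`, as in `nn.Linear`. GPT-2's `transformers.pytorch_utils.Conv1D` stores the transposed layout `(in_features, out_features)` and is handled in PEFT with `fan_in_fan_out=True`, so before this patch the SVD ran on the wrong orientation and the residual written back corrupted the base weight (see 2104). The following is a minimal, standalone sketch of the patched flow; `transpose` and `pissa_factors` here are local stand-ins for illustration, not PEFT's actual code:

```python
import torch


def transpose(weight: torch.Tensor, fan_in_fan_out: bool) -> torch.Tensor:
    # Local stand-in for the helper used in layer.py: undo the Conv1D layout.
    return weight.T if fan_in_fan_out else weight


def pissa_factors(weight: torch.Tensor, fan_in_fan_out: bool, r: int = 8):
    # Bring the weight into the canonical (out_features, in_features)
    # orientation before the SVD, as the patched pissa_init now does.
    w = transpose(weight.to(torch.float32), fan_in_fan_out)
    U, S, Vh = torch.linalg.svd(w, full_matrices=False)
    lora_B = U[:, :r] * S[:r].sqrt()             # (out_features, r)
    lora_A = S[:r].sqrt().unsqueeze(1) * Vh[:r]  # (r, in_features)
    residual = w - lora_B @ lora_A
    # Transpose back so the residual matches the base layer's storage layout.
    return lora_A, lora_B, transpose(residual, fan_in_fan_out)


# GPT-2's Conv1D stores c_attn as (in_features, out_features), e.g. (768, 2304).
w_conv1d = torch.randn(768, 2304)
lora_A, lora_B, residual = pissa_factors(w_conv1d, fan_in_fan_out=True)
assert residual.shape == w_conv1d.shape  # safe to write back into the base layer
# In the canonical orientation, residual^T + B @ A reconstructs the weight.
assert torch.allclose(residual.T + lora_B @ lora_A, w_conv1d.T, atol=1e-4)
```

The same layout mismatch is why the test helpers above now match `Conv1D` modules alongside `torch.nn.Linear`: without that, the GPT-2 attention projections would be skipped during quantization and error measurement, and the new `test_gpt2_pissa_*` tests could not exercise the regression.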