# QSTConfig.py
import math
from dataclasses import dataclass, field
from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

import bitsandbytes as bnb
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
                          PretrainedConfig, AutoModelForSequenceClassification)
from transformers.activations import get_activation
from transformers.modeling_outputs import (SequenceClassifierOutputWithPast,
                                           CausalLMOutputWithPast, BaseModelOutputWithPast)


@dataclass
class QSTConfig(PretrainedConfig):
    # Hyperparameters of the side adapters.
    add_layer_norm_before_adapter: bool = False
    add_layer_norm_after_adapter: bool = True
    activation: str = ""
    r: int = 0
    alpha_r: int = 0
    dropout: float = 0.0
    fan_in_fan_out: bool = False
    peft_hidden_size: int = 64
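
# Example (sketch): constructing a config with illustrative values. The field
# values below are assumptions for demonstration, not recommended defaults.
#
#   config = QSTConfig(
#       activation="gelu",
#       r=16,
#       alpha_r=16,
#       dropout=0.1,
#       add_layer_norm_after_adapter=True,
#       peft_hidden_size=64,
#   )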


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Assumes the wrapped backbone is exposed as ``model.LLM``; classification
    head parameters ("score" / "classifier") are counted separately.
    """
    trainable_params = 0
    all_param = 0
    classifier_params = 0
    for name, param in model.named_parameters():
        if "score" in name:
            classifier_params += param.numel()
            param.requires_grad = False
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    for name, param in model.LLM.named_parameters():
        if "classifier" in name:
            classifier_params += param.numel()
            param.requires_grad = True
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || "
        f"trainable%: {100 * trainable_params / all_param}"
    )
    print(f"classifier: {classifier_params}")


def _get_submodules(model, key):
    # Resolve a dotted module path into (parent module, child module, child name).
    parent = model.get_submodule(".".join(key.split(".")[:-1]))
    target_name = key.split(".")[-1]
    target = model.get_submodule(key)
    return parent, target, target_name
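
# Example (sketch): swapping a target linear layer for an AdapterLinear of
# matching shape. The module path "model.layers.0.mlp.down_proj" and the
# hyperparameter values are assumptions that depend on the backbone.
#
#   parent, target, target_name = _get_submodules(llm, "model.layers.0.mlp.down_proj")
#   adapter = AdapterLinear(target.in_features, target.out_features, r=16, alpha_r=16,
#                           activation="gelu", add_layer_norm_after_adapter=True)
#   setattr(parent, target_name, adapter)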


class Activations(nn.Module):
    # Thin wrapper that looks up an activation function by name
    # (via transformers.activations.get_activation).
    def __init__(self, activation_type):
        super().__init__()
        self.f = get_activation(activation_type)

    def forward(self, x):
        return self.f(x)


class AdapterLinear(nn.Module):
    # Low-rank adapter: down-projection A, optional activation, up-projection B,
    # with optional LayerNorm before and/or after the adapter.
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 r: int,
                 alpha_r: int,
                 activation=None,
                 add_layer_norm_before_adapter=False,
                 add_layer_norm_after_adapter=False,
                 dropout=0.0,
                 bias=False,
                 ):
        super().__init__()
        self.adapter_A = nn.Linear(in_features, r, bias=bias)
        nn.init.kaiming_uniform_(self.adapter_A.weight, a=math.sqrt(5))
        self.adapter_B = nn.Linear(r, out_features, bias=bias)
        nn.init.kaiming_uniform_(self.adapter_B.weight, a=math.sqrt(5))
        # nn.init.zeros_(self.adapter_B.weight)
        if activation is not None:
            self.activation = Activations(activation.lower())
        else:
            self.activation = None
        self.add_layer_norm_before_adapter = add_layer_norm_before_adapter
        self.add_layer_norm_after_adapter = add_layer_norm_after_adapter
        if self.add_layer_norm_before_adapter:
            self.pre_layer_norm = nn.LayerNorm(in_features)
        if self.add_layer_norm_after_adapter:
            self.post_layer_norm = nn.LayerNorm(out_features)
        # Apply dropout only when a positive rate is given.
        if dropout > 0.0:
            self.dropout_layer = nn.Dropout(p=dropout)
        else:
            self.dropout_layer = nn.Identity()
        # Scaling factor; computed here but not applied in forward.
        self.scaling = r / alpha_r

    def set_bias(self, enabled=False):
        # Only meaningful when the adapter layers were built with bias=True;
        # with the default bias=False, adapter_A.bias and adapter_B.bias are None.
        self.adapter_A.bias.requires_grad = enabled
        self.adapter_B.bias.requires_grad = enabled

    def forward(self, x):
        x = self.dropout_layer(x)
        if self.add_layer_norm_before_adapter:
            x = self.pre_layer_norm(x)
        x = self.adapter_A(x)
        if self.activation is not None:
            x = self.activation(x)
        y = self.adapter_B(x)
        if self.add_layer_norm_after_adapter:
            y = self.post_layer_norm(y)
        return y
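

# Minimal smoke test (sketch): the shapes and hyperparameter values below are
# illustrative assumptions, not values prescribed by this module.
if __name__ == "__main__":
    adapter = AdapterLinear(
        in_features=768,
        out_features=768,
        r=16,
        alpha_r=16,
        activation="gelu",
        add_layer_norm_after_adapter=True,
        dropout=0.1,
    )
    hidden = torch.randn(2, 10, 768)  # (batch, sequence, hidden)
    out = adapter(hidden)             # same shape as the input
    print(out.shape)                  # torch.Size([2, 10, 768])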