ft_from_0.py
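# Fine-tunes a masked language model (Bert, RoBerta, or XLM) on a line-by-line text file
# with the Hugging Face Trainer and saves the fine-tuned model to disk.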
import torch
import argparse
import time
import datetime
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Report the available hardware (the Trainer below handles device placement on its own).
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
parser = argparse.ArgumentParser(description='this version uses an already-made txt file')
# 0: which pretrained language model to fine-tune
parser.add_argument('--LM', type=str, action='store', choices=['Bert', 'RoBerta', 'XLM'])
# 1: run on the original big dataset or the small test dataset
group = parser.add_mutually_exclusive_group()
group.add_argument('--running', action='store_true', help='running using the original big dataset')
group.add_argument('--testing', action='store_true', help='testing using the small dataset')
# 2: number of epochs and batch size
parser.add_argument('--num_train_epochs', '-e', type=int)
parser.add_argument('--batch_size', '-b', type=int)
# 3: which dataset the text file comes from (used in the output directory name)
parser.add_argument('--data', type=str, action='store', choices=['AG10K', 'wassem', 'tweet50k', 'multi-label'])
# 4: where to save the LM model
parser.add_argument('--resultpath', type=str, help='where to save the LM model')
args = parser.parse_args()
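# Illustrative invocation (the paths and values below are examples, not taken from the repo):
#   python ft_from_0.py --LM Bert --running -e 20 -b 16 --data multi-label --resultpath ./multi-label_LM/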
import pandas as pd
import regex as re
if args.LM == 'Bert':
    from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM
    config = BertConfig(vocab_size=28996,
                        max_position_embeddings=512,
                        num_attention_heads=12,
                        num_hidden_layers=12,
                        # type_vocab_size=2, default is 2
                        )
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
    model = BertForMaskedLM.from_pretrained('./multi-label_LM/multi-label_Bert_e10_b16', config=config)
    # model = BertForMaskedLM.from_pretrained('./multi-label_train.csv_LMmodel', config=config)
    # 12-layer, 768-hidden, 12-heads, 110M parameters.
elif args.LM == 'RoBerta':
    from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM
    config = RobertaConfig(vocab_size=50265,
                           max_position_embeddings=514,
                           num_attention_heads=12,
                           num_hidden_layers=12,
                           type_vocab_size=1,
                           )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters; roberta-base uses the bert-base architecture
elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel
    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
    # XLM English-French model trained on the concatenation of English and French Wikipedia
else:
    parser.error('--LM must be one of Bert, RoBerta, XLM')
print(model)
def freeze_layer_fun(freeze_layer):
    # Freeze every parameter whose name contains the given substring.
    for name, param in model.named_parameters():
        if freeze_layer in name:
            print(name)
            param.requires_grad = False
        else:
            pass
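# The script never calls freeze_layer_fun; a hypothetical use would be, for example:
#   freeze_layer_fun('embeddings')  # freeze all embedding parameters before training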
file_path = 'multi-label_train.csv.txt'
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=128)
# dataset = load_dataset("./csv_for_ft_new.py", data_files=file_path)
# Dynamic masking for the MLM objective: 15% of the tokens are masked in each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
output_dir = str(args.resultpath) + str(args.data) + '_' + str(args.LM) + '_e' + str(args.num_train_epochs) + '_b' + str(args.batch_size)
training_args = TrainingArguments(
    do_train=True,
    do_predict=True,
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=args.num_train_epochs,
    per_device_train_batch_size=args.batch_size,
    save_steps=10000,        # checkpoint every 10,000 steps
    save_total_limit=2,      # keep only the two most recent checkpoints
)
# default learning rate (5e-5)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)
def format_time(elapsed):
    # Round to whole seconds and format as hh:mm:ss.
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))
t0 = time.time()
trainer.train()
elapsed = format_time(time.time() - t0)
print('============= training LM time: ', elapsed)
''' Save final model (+ lm_model + config) to disk '''
trainer.save_model(output_dir)
print('the language model saved at ', output_dir)
''' check the trained lm'''
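# The original file ends here. Below is a minimal, illustrative sketch of such a check,
# assuming a Bert/RoBerta-style masked-LM checkpoint and a made-up probe sentence
# (both are assumptions, not part of the original script):
if args.LM in ('Bert', 'RoBerta'):
    from transformers import pipeline
    fill_mask = pipeline('fill-mask', model=output_dir, tokenizer=tokenizer)
    print(fill_mask('The weather today is ' + tokenizer.mask_token + ' .'))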