-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_finetuned.py
100 lines (78 loc) · 3.15 KB
/
run_finetuned.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# # '''
# # BERT - Inference only
# # This is the final inference script, to be used as the base for the desktop application.
# # It uses the fine-tuned and trained model downloaded from the Google Colab
# # notebook written for training. Takes the user input text, from console,
# # writes it to a csv file, and then makes the prediction using the csv file
# # and finally returns the result to the user.
# # '''
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import torch
import numpy as np
import csv
from csv import writer
# Location of the pre-trained, fine-tuned model; the tokenizer is loaded from
# the same directory.
model_name = "../BERT/bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load the saved weights onto the CPU for this runtime.
# (Original author's note: map_location is not needed when running in a notebook.)
model_fn = '../BERT/bert-base-cased/pytorch_model.bin'
model_state_dict = torch.load(model_fn, map_location=torch.device('cpu'))

# Build the sequence-classification model with a single label (num_labels=1)
# and hand it the state dict loaded above.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    state_dict=model_state_dict,
)

# The two lines below should not be needed because the same adjustment was
# already made during training. If the results look unusual, uncomment them
# and run again (NOTE: `nn` is not among the imports visible in this file).
# model.classifier = nn.Linear(768, 1)
# model.num_labels = 1
def tokenize(dataset):
    """Tokenize the ``'excerpt'`` entry of *dataset* with the module tokenizer.

    Every sequence is padded to, and truncated at, 512 tokens.

    Args:
        dataset: Mapping that provides the text under the ``'excerpt'`` key
            (presumably a batch supplied by ``Dataset.map`` -- confirm
            against the caller).

    Returns:
        Whatever the tokenizer call returns for those excerpts.
    """
    excerpts = dataset['excerpt']
    return tokenizer(
        excerpts,
        truncation=True,
        max_length=512,
        padding='max_length',
    )
# # The score calculation
def compute_metrics(eval_pred):
    """Return the root-mean-square error between predictions and labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair of array-likes that support
            ``.squeeze()``; singleton dimensions are dropped from both before
            they are compared.

    Returns:
        A dict with the single key ``'RMSE'``.
    """
    predictions, targets = eval_pred
    # Drop singleton axes so that e.g. (N, 1) logits line up with (N,) labels.
    residuals = targets.squeeze() - predictions.squeeze()
    return {'RMSE': np.sqrt(np.mean(residuals ** 2))}
# Inference-only Trainer: no training arguments are passed in. It is given
# just the loaded model and the metric function.
trainer = Trainer(model=model, compute_metrics=compute_metrics)
def calculate_target(text):
    """Score *text* with the loaded model and return the result as a string.

    test.csv acts as the middle man between BERT and the text input: the
    text is written to that file under an ``excerpt`` header, the file is
    loaded as a dataset, tokenized, and passed to ``trainer.predict``.

    Args:
        text: The excerpt to score.

    Returns:
        The rescaled prediction, rounded to the nearest integer, as ``str``.

    Raises:
        Whatever the dataset loading, tokenization or prediction raises; the
        intermediate csv file is cleaned up in that case as well.
    """
    csv_path = '../BERT/test.csv'

    # 'w' (not 'a') so that rows left behind by an earlier, interrupted run
    # cannot end up in this prediction. newline='' is what the csv module
    # expects of files it writes to, and the explicit encoding keeps
    # non-ASCII text independent of the platform's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8') as f_object:
        writer_object = writer(f_object)
        writer_object.writerow(['excerpt'])
        writer_object.writerow([text])

    try:
        test = load_dataset('csv', data_files=[csv_path])
        test_dataset = test.map(tokenize, batched=True)
        test_dataset = test_dataset.remove_columns(['excerpt'])
        test_dataset = test_dataset['train']
        output = trainer.predict(test_dataset)
    finally:
        # Always reset the csv file, even when prediction fails, so the next
        # call starts from a clean state.
        clean_file(text)

    # A single input row yields a single prediction; convert it to a plain
    # float so that round() returns a Python int on every numpy version.
    target = float(output.predictions.squeeze())
    # Linear rescaling of the raw model output.
    # NOTE(review): the origin of the constants 1.386119 and 0.483295 is not
    # shown in this file -- confirm against the training notebook.
    target = 3 + (1.386119 - target) / 0.483295
    return str(round(target))
# # The code below cleans up the csv file for the next
# # program execution.
def clean_file(text, path='../BERT/test.csv'):
    """Empty the intermediate csv file so the next prediction starts clean.

    The file is truncated to zero length, and created if it does not exist.
    Earlier code opened the same file for reading and writing at once and
    filtered rows with a comparison that could never match, so emptying the
    file was its only real effect; this does the same thing explicitly and
    without leaking file handles.

    Args:
        text: Unused. Kept so existing callers that pass the excerpt keep
            working.
        path: File to empty. Defaults to the csv file shared with
            ``calculate_target``.
    """
    # Opening in 'w' mode truncates the file; nothing needs to be written.
    with open(path, 'w', newline='', encoding='utf-8'):
        pass
# import sys
# text = sys.argv[1]
# def calculate_target(text):
# return text
# print(calculate_target(text))