import datetime
import time

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, LSTM, Dropout, concatenate, BatchNormalization
from keras import optimizers
from keras.callbacks import ModelCheckpoint

class QaDNN(object):
    def __init__(self, questions=None, answers=None, labels=None, word_embeddings=None,
                 word_embeddings_trainable=True, val_questions=None, val_answers=None,
                 val_labels=None, maxlen=40, epochs=5, batch_size=30, load_from_file=None):
        self.questions = np.array(questions)
        self.answers = np.array(answers)
        self.labels = np.array(labels)
        # keep validation data as None when it is not supplied, so train() can skip it
        self.val_questions = np.array(val_questions) if val_questions is not None else None
        self.val_answers = np.array(val_answers) if val_answers is not None else None
        self.val_labels = np.array(val_labels) if val_labels is not None else None
        self.maxlen = maxlen
        self.epochs = epochs
        self.batch_size = batch_size
        if load_from_file is not None:
            self.load_model(load_from_file)
            return
        vocab_size, embedding_size = word_embeddings.shape
        print('Shape of questions:', self.questions.shape)
        print('Shape of answers:', self.answers.shape)
        print('Shape of labels:', self.labels.shape)
        # model inputs: one padded word-index sequence for the question, one for the answer
        q_input = Input(shape=(maxlen,))
        a_input = Input(shape=(maxlen,))
        # shared embedding layer, initialised with the pretrained word embeddings
        embedding_layer = Embedding(vocab_size, embedding_size, weights=[word_embeddings],
                                    input_length=maxlen, trainable=word_embeddings_trainable)
        # map question and answer word indices to embedding matrices separately
        q = embedding_layer(q_input)
        a = embedding_layer(a_input)
        # two stacked LSTM layers give each sequence a fixed-size representation
        q = LSTM(50, return_sequences=True)(q)
        a = LSTM(50, return_sequences=True)(a)
        q = LSTM(50, return_sequences=False)(q)
        a = LSTM(50, return_sequences=False)(a)
        # concatenate the two vectors for classification
        merged = concatenate([q, a])
        # dense layer with dropout and batch normalization to reduce overfitting
        merged = Dense(200, activation='relu')(merged)
        merged = Dropout(0.2)(merged)
        merged = BatchNormalization()(merged)
        # a second identical block
        merged = Dense(200, activation='relu')(merged)
        merged = Dropout(0.2)(merged)
        merged = BatchNormalization()(merged)
        # output the matching probability
        output = Dense(1, activation='sigmoid')(merged)
        self.model = Model(inputs=[q_input, a_input], outputs=output)
        # binary cross-entropy loss, optimized with Adam
        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=1e-6, amsgrad=False)
        self.model.compile(loss='binary_crossentropy', optimizer=adam)
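
    # For reference, a sketch of the tensor shapes flowing through the network
    # (E is the embedding size; maxlen defaults to 40):
    #   q_input / a_input:               (batch, 40)
    #   Embedding:                       (batch, 40, E)
    #   LSTM(50, return_sequences=True): (batch, 40, 50)
    #   LSTM(50):                        (batch, 50)
    #   concatenate:                     (batch, 100)
    #   Dense(200) x 2 -> Dense(1):      (batch, 1) matching probability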

    def load_model(self, path):
        self.model = load_model(path)

    def save_model(self, path):
        self.model.save(path)

    def train(self):
        # pad the inputs to a fixed length first
        questions_padded = pad_sequences(self.questions, maxlen=self.maxlen, padding='post')
        answers_padded = pad_sequences(self.answers, maxlen=self.maxlen, padding='post')
        # build the validation set, if one was supplied
        validation_data = None
        if self.val_questions is not None:
            val_questions_padded = pad_sequences(self.val_questions, maxlen=self.maxlen, padding='post')
            val_answers_padded = pad_sequences(self.val_answers, maxlen=self.maxlen, padding='post')
            validation_data = ([val_questions_padded, val_answers_padded], [self.val_labels])
        print("Starting training at", datetime.datetime.now())
        t0 = time.time()
        callbacks = [ModelCheckpoint("model/question_answer_weights.h5", monitor='val_loss', save_best_only=True)]
        # the binary classes are skewed, so give more weight to the class with fewer samples
        class_weight = {0: 1.,
                        1: 6.}
        # start training
        history = self.model.fit([questions_padded, answers_padded], [self.labels],
                                 epochs=self.epochs, batch_size=self.batch_size, callbacks=callbacks,
                                 validation_data=validation_data, class_weight=class_weight)
        t1 = time.time()
        print("Training ended at", datetime.datetime.now())
        print("Minutes elapsed: %f" % ((t1 - t0) / 60.))
        min_val_loss, idx = min((val, idx) for (idx, val) in enumerate(history.history['val_loss']))
        print("Minimum validation loss = {0:.4f} (epoch {1:d})".format(min_val_loss, idx + 1))

    def predict(self, questions, answers):
        '''
        Predict whether each question/answer pair matches; returns an array of
        shape (n_pairs, 1) of sigmoid matching probabilities.
        '''
        questions_padded = pad_sequences(questions, maxlen=self.maxlen, padding='post')
        answers_padded = pad_sequences(answers, maxlen=self.maxlen, padding='post')
        return self.model.predict([questions_padded, answers_padded])

    def rank(self, question, answers):
        '''
        Given a question and multiple candidate answers, rank the candidates by
        predicted matching score, best first. Returns (candidate index, score) pairs.
        '''
        questions = np.tile(question, (len(answers), 1))
        scores = [item[0] for item in self.predict(questions, answers)]
        sorted_answers = sorted(enumerate(scores), key=lambda k: k[1], reverse=True)
        return sorted_answers
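
    # Illustrative example (hypothetical scores): if predict() returned
    # [[0.1], [0.9], [0.4]] for three candidates, rank() would return
    # [(1, 0.9), (2, 0.4), (0, 0.1)], i.e. candidate indices sorted by score.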

    def evaluate(self, test_data):
        '''
        Calculate the Mean Reciprocal Rank (MRR) over test_data, an iterable of
        (question, candidates, labels) triples where the correct candidate is
        labelled 1.
        '''
        MRR = 0
        count = 0
        for question, candidates, labels in test_data:
            count += 1
            true_answer = labels.index(1)
            ranked = self.rank(question, candidates)
            # 0-based position of the true answer in the ranked candidate list
            position = [item[0] for item in ranked].index(true_answer)
            MRR += 1.0 / (position + 1)
        return MRR / count
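

# A minimal end-to-end smoke test: a hedged sketch with hypothetical random
# data, not part of the original pipeline. It assumes questions and answers
# are lists of word-index sequences and that a pretrained embedding matrix of
# shape (vocab_size, embedding_size) is available.
if __name__ == '__main__':
    import os
    os.makedirs('model', exist_ok=True)                  # ModelCheckpoint writes into model/
    rng = np.random
    embeddings = rng.rand(1000, 50)                      # stand-in for pretrained embeddings
    questions = rng.randint(1, 1000, (20, 10)).tolist()  # 20 index-encoded questions
    answers = rng.randint(1, 1000, (20, 10)).tolist()    # 20 candidate answers
    labels = [1, 0] * 10                                 # 1 = matching pair, 0 = not
    qa = QaDNN(questions[:16], answers[:16], labels[:16],
               word_embeddings=embeddings,
               val_questions=questions[16:], val_answers=answers[16:],
               val_labels=labels[16:], epochs=1, batch_size=4)
    qa.train()
    print(qa.rank(questions[0], answers[:5]))            # candidate indices sorted by score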