# simple_rnn_classifier.py: a simple CNN-RNN sentence classifier
import codecs, re
import numpy as np
# Scikit-Learn and NLTK for preprocessing
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Keras for the neural network classifier
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM  # GRU/SimpleRNN are drop-in alternatives
from keras.callbacks import EarlyStopping
# parameters
MAX_VOCAB = 18000
MAX_LENGTH = 50
EMBEDDING_SIZE = 64
BATCH_SIZE = 32
MAX_EPOCHS = 25
DROPOUT_RATE = 0.4
stop_monitor = 'val_loss'  # quantity to monitor for early stopping (default: val_loss)
stop_delta = 0.0           # minimum change that counts as an improvement (default: 0)
stop_epochs = 2            # epochs with no improvement before stopping, i.e. patience (default: 0)
# load the data.
print("Loading data...\n")
# TODO: edit filenames here
f_sents = codecs.open('datasets/brown_sents.txt', 'r', encoding='utf8')
f_classes = codecs.open('datasets/brown_topics.txt', 'r', encoding='utf8')
sents = [sent.strip() for sent in f_sents.readlines()]
labels = [label.strip() for label in f_classes.readlines()]
# number of labels
num_labels = len(set(labels))
# a custom tokenizer to clean and preprocess the data
print("Fitting tokenizer...\n")
# any sklearn/NLTK tokenizing function can be plugged in here,
# so this is the place to get fancy with stopword removal, etc.
def tokenize(sentence):
    stemmer = SnowballStemmer("english")
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w\s]', '', sentence)
    wordlist = sentence.strip('\n').split(' ')
    result = [stemmer.stem(word) for word in wordlist]
    return result
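# quick sanity check -- expected output given the Snowball stemmer above:
# tokenize("The cats are running!")  ->  ['the', 'cat', 'are', 'run']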
# get count vectors
# https://github.com/fchollet/keras/issues/17
sentvectorizer = CountVectorizer(tokenizer=tokenize, max_features=MAX_VOCAB-1)
sentvectorizer.fit(sents)
# prepare labels
print("Preparing labels...\n")
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)
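# LabelEncoder maps string labels to sorted integer ids,
# e.g. ['news', 'romance', 'news'] -> [0, 1, 0]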
# generate new training data
print("Generating training data...\n")
X_train, X_test, y_train, y_test = train_test_split(sents, labels, test_size=0.2)
train_sents = X_train
test_sents = X_test
# map each sentence to its vocabulary indices; note that row.indices is sorted,
# so word order is lost. shift by 1 so that index 0 is reserved for padding.
X_train = sentvectorizer.transform(X_train)
X_train = [row.indices + 1 for row in X_train]
X_test = sentvectorizer.transform(X_test)
X_test = [row.indices + 1 for row in X_test]
# truncate and pad input sequences
X_train = sequence.pad_sequences(X_train, maxlen=MAX_LENGTH)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_LENGTH)
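# pad_sequences pre-pads/pre-truncates by default, e.g.
# sequence.pad_sequences([[5, 9]], maxlen=4) -> [[0, 0, 5, 9]]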
# one-hot encoding for output
# https://www.reddit.com/r/MachineLearning/comments/31fk7i/converting_target_indices_to_onehotvector/
y_train = np.eye(num_labels)[y_train]
y_test = np.eye(num_labels)[y_test]
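# e.g. with num_labels == 3, encoded label 1 becomes [0., 1., 0.]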
# check data
print(X_train[0])
print(y_train[0])
print('')
'''
for short sentences, a deep RNN without any convolutional layers also works well
'''
# create the model
model = Sequential()
# embedding layer - initialized randomly and trained with model
model.add(Embedding(MAX_VOCAB, EMBEDDING_SIZE, input_length=MAX_LENGTH))
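# shapes: (batch, MAX_LENGTH) integer indices -> (batch, MAX_LENGTH, EMBEDDING_SIZE)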
# convolution and max pooling layers
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(DROPOUT_RATE))
# RNN (LSTM) layers - duplicate the first one for an even deeper network
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(DROPOUT_RATE))
model.add(LSTM(128))
# dense layer that outputs class
# an extra hidden Dense layer before the output tends to help classification
model.add(Dense(num_labels*3, activation='relu'))
model.add(Dense(num_labels, activation='softmax'))  # softmax: single-label multiclass
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# callback for early stoppage
earlystop = EarlyStopping(monitor=stop_monitor,
                          min_delta=stop_delta,
                          patience=stop_epochs,
                          verbose=1,
                          mode='auto')
callbacks_list = [earlystop] # add model checkpointing etc here
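# a minimal checkpointing sketch (optional; the filename is illustrative):
# from keras.callbacks import ModelCheckpoint
# checkpoint = ModelCheckpoint('weights_best.h5', monitor=stop_monitor,
#                              save_best_only=True, verbose=1)
# callbacks_list.append(checkpoint)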
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          epochs=MAX_EPOCHS,
          batch_size=BATCH_SIZE,
          callbacks=callbacks_list)
model.save('trained_model.h5')
print("saved model to disk")
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=1)
print("\n\nAccuracy: %.2f%%" % (scores[1]*100), '\n')
# show a few example predictions against the gold labels
preds = model.predict(X_test, verbose=1)
print('\n')
for idx, pred in enumerate(preds[:10]):
    # both y_test[idx] and pred are probability/one-hot vectors,
    # so we need the index of the maximum value to recover the label
    print(encoder.inverse_transform([np.argmax(y_test[idx])])[0], "|",
          encoder.inverse_transform([np.argmax(pred)])[0], " : ",
          test_sents[idx])