import sys, datetime, os, math
import numpy as np
import classifiers, re
import preprocess as data_proc
from keras.models import model_from_json
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
import keras.backend as K
from collections import Counter, OrderedDict
from pandas import read_csv
from numpy.random import seed, shuffle
import pandas as pd
import json
import config
import pymysql
# Root directory used to build all file and plot paths below.
# Assumption: the project is run from its root directory; the original may define this elsewhere.
path = os.getcwd()
def Config(dbname):
    if dbname != config.DATABASE_CONFIG['dbname']:
        raise ValueError("Could not find a DB with the given name")
conn = pymysql.connect(host=config.DATABASE_CONFIG['host'],
user=config.DATABASE_CONFIG['user'],
password=config.DATABASE_CONFIG['password'],
db=config.DATABASE_CONFIG['dbname'])
return conn
def from_json(file_path):  # load config from a json file
    with open(file_path, "r") as f:
        return json.load(f)
def load_file(filename):
file = open(filename, 'r')
text = file.read()
file.close()
return text.strip().split("\n")
def load_csv(file_path, *value):
df = pd.read_csv(file_path)
if value:
attr = df[value[0]]
return attr
return df
def save_file(lines, filename):
data = '\n'.join(lines)
    file_path = re.sub(r'/\w+\.txt', '', filename)
if not os.path.exists(file_path):
os.makedirs(file_path)
file = open(filename, 'w')
file.write(data)
file.close()
def load_data_panda(filename, shuffle_sets=False):
print("Reading data from file %s..." % filename)
data = read_csv(filename, sep="\t+", header=None, engine='python')
data.columns = ["Set", "Label", "Text"]
print('The shape of this data set is: ', data.shape)
x_train, labels_train = np.array(data["Text"]), np.array(data["Label"])
if shuffle_sets:
np.random.seed(12346598)
indices = np.arange(len(x_train))
np.random.shuffle(indices)
x_train = x_train[indices]
labels_train = labels_train[indices]
return x_train, labels_train
def save_as_dataset(data, labels, filename):
lines = []
first_word = "TrainSet" if "train" in filename else "TestSet"
for i in range(len(labels)):
if data[i] is not None:
lines.append(first_word + '\t' + str(labels[i]) + '\t' + str(data[i]))
data = '\n'.join(lines)
file = open(filename, 'w')
file.write(data)
file.close()
def save_dictionary(dictionary, filename):
lines = []
for k, v in dictionary.items():
lines.append(k + '\t' + str(v))
file = open(filename, 'w')
file.write('\n'.join(lines))
file.close()
def load_dictionary(filename):
dictionary = {}
file = open(filename, 'r')
lines = file.read()
file.close()
for line in lines.split("\n"):
key, value = line.split("\t")
dictionary[key] = value
return dictionary
def save_model(model, json_name, h5_weights_name):
model_json = model.to_json()
with open(json_name, "w") as json_file:
json_file.write(model_json)
model.save_weights(h5_weights_name)
print("Saved model with json name %s, and weights %s" % (json_name, h5_weights_name))
def load_model(json_name, h5_weights_name, verbose=False):
# In case of saved model (not to json or yaml)
# model = models.load_model(model_path, custom_objects={'f1_score': f1_score})
loaded_model_json = open(json_name, 'r').read()
model = model_from_json(loaded_model_json)
model.load_weights(h5_weights_name)
if verbose:
print("Loaded model with json name %s, and weights %s" % (json_name, h5_weights_name))
return model
# Given any number of dicts, shallow copy and merge into a new dict,
# precedence goes to key value pairs in latter dicts.
# This is in case a Python3.5 version is NOT used. (needed for my access to the zCSF cluster)
def merge_dicts(*dict_args):
result = {}
for dictionary in dict_args:
result.update(dictionary)
return result
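# A quick self-contained sanity check of the merging order (later dicts win on
# key collisions):
#   merge_dicts({'a': 1, 'b': 2}, {'b': 20, 'c': 3})  ->  {'a': 1, 'b': 20, 'c': 3}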
# Just a primitive batch generator
def batch_generator(x, y, batch_size):
seed(1655483)
size = x.shape[0]
x_copy = x.copy()
y_copy = y.copy()
indices = np.arange(size)
np.random.shuffle(indices)
x_copy = x_copy[indices]
y_copy = y_copy[indices]
i = 0
while True:
if i + batch_size <= size:
yield x_copy[i:i + batch_size], y_copy[i:i + batch_size]
i += batch_size
else:
i = 0
indices = np.arange(size)
np.random.shuffle(indices)
x_copy = x_copy[indices]
y_copy = y_copy[indices]
continue
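# A minimal usage sketch (hypothetical names: x_train/y_train are assumed to be
# numpy arrays and `model` an already-compiled Keras model):
#   gen = batch_generator(x_train, y_train, batch_size=32)
#   model.fit_generator(gen, steps_per_epoch=x_train.shape[0] // 32, epochs=5)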
def shuffle_data(labels, n):
seed(532908)
indices = range(len(labels))
pos_indices = [i for i in indices if labels[i] == 1]
neg_indices = [i for i in indices if labels[i] == 0]
shuffle(pos_indices)
shuffle(neg_indices)
top_n = pos_indices[0:n] + neg_indices[0:n]
shuffle(top_n)
return top_n
# Get some idea about the max and mean length of the tweets (useful for deciding on the sequence length)
from nltk import TweetTokenizer
tkr = TweetTokenizer()
def get_max_len_info(tweets, average=False):
# sum_of_length = sum([tkr.tokenize(l) for l in tweets])
sum_of_length = sum([len(str(l).split()) for l in tweets])
avg_tweet_len = sum_of_length / float(len(tweets))
print("Mean of train tweets: ", avg_tweet_len)
    max_tweet_len = max([len(tkr.tokenize(str(t))) for t in tweets])
print("Max tweet length is = ", max_tweet_len)
if average:
return avg_tweet_len
return max_tweet_len
def get_classes_ratio(labels):
positive_labels = sum(labels)
negative_labels = len(labels) - sum(labels)
ratio = [max(positive_labels, negative_labels) / float(negative_labels),
max(positive_labels, negative_labels) / float(positive_labels)]
print("Class ratio: ", ratio)
return ratio
def get_classes_ratio_as_dict(labels):
ratio = Counter(labels)
    ratio_dict = {0: max(ratio[0], ratio[1]) / float(ratio[0]), 1: max(ratio[0], ratio[1]) / float(ratio[1])}
print('Class ratio: ', ratio_dict)
return ratio_dict
def extract_features_from_dict(train_features, test_features):
# Transform the list of feature-value mappings to a vector
vector = DictVectorizer(sparse=False)
# Learn a list of feature name -> indices mappings and transform X_train_features
x_train_features = vector.fit_transform(train_features).tolist()
# Just transform the X_test_features, based on the list fitted on X_train_features
# Disadvantage: named features not encountered during fit_transform will be silently ignored.
x_test_features = vector.transform(test_features).tolist()
print('Size of the feature sets: train = ', len(x_train_features[0]), ', test = ', len(x_test_features[0]))
return x_train_features, x_test_features
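# A self-contained sketch of the expected input: one feature-name -> value dict per
# example (the feature names below are made up purely for illustration):
#   train = [{'length': 12.0, 'has_emoji': 1.0}, {'length': 7.0, 'has_emoji': 0.0}]
#   test = [{'length': 5.0, 'has_emoji': 1.0}]
#   x_train_feats, x_test_feats = extract_features_from_dict(train, test)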
def feature_scaling(features):
scaled_features = []
max_per_col = []
for i in range(len(features[0])):
maxx = max([abs(f[i]) for f in features])
if maxx == 0.0:
maxx = 1.0
max_per_col.append(maxx)
for f in features:
scaled_features.append([float(f[i]) / float(max_per_col[i]) for i in range(len(f))])
return scaled_features
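# Note: this is per-column max-abs scaling (every column is divided by its largest
# absolute value), so all scaled values end up in [-1, 1]. A tiny worked example:
#   feature_scaling([[2.0, 10.0], [1.0, -5.0]])  ->  [[1.0, 1.0], [0.5, -0.5]]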
def run_supervised_learning_models(train_features, train_labels, test_features, test_labels,
make_feature_analysis=False, feature_names=None, top_features=0, plot_name="coeff"):
class_ratio = get_classes_ratio_as_dict(train_labels) # alternatively, can be set class_ratio = 'balanced'
classifiers.linear_svm_grid(train_features, train_labels, test_features, test_labels, class_ratio,
make_feature_analysis, feature_names, top_features, plot_name)
classifiers.logistic_regression_grid(train_features, train_labels, test_features, test_labels, class_ratio,
make_feature_analysis, feature_names, top_features, plot_name)
# classifiers.nonlinear_svm(train_features, train_labels, test_features, test_labels, class_ratio,
# make_feature_analysis, feature_names, top_features, plot_name)
# Convert tweets into an array of indices of shape (m, max_tweet_length)
def tweets_to_indices(tweets, word_to_index, max_tweet_len):
m = tweets.shape[0]
tweet_indices = np.zeros((m, max_tweet_len))
for i in range(m):
sentence_words = [w.lower() for w in tweets[i].split()]
j = 0
for w in sentence_words:
tweet_indices[i, j] = word_to_index[w]
j = j + 1
return tweet_indices
def encode_text_as_matrix(train_tweets, test_tweets, mode, max_num_words=None, lower=False, char_level=False):
# Create the tokenizer
tokenizer = Tokenizer(num_words=max_num_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=lower, split=" ", char_level=char_level)
# Fit the tokenizer on the documents
tokenizer.fit_on_texts(train_tweets)
# Encode each example using a 'mode' scoring method (mode can be count, binary, freq, tf-idf)
x_train = tokenizer.texts_to_matrix(train_tweets, mode=mode)
x_test = tokenizer.texts_to_matrix(test_tweets, mode=mode)
return tokenizer, x_train, x_test
def encode_text_as_word_indexes(train_tweets, test_tweets, max_num_words=None, lower=False, char_level=False):
# Create the tokenizer
tokenizer = Tokenizer(num_words=max_num_words, filters='', lower=lower, split=" ", char_level=char_level)
# Fit the tokenizer on the documents
tokenizer.fit_on_texts(train_tweets)
# Encode each example as a sequence of word indexes based on the vocabulary of the tokenizer
x_train = tokenizer.texts_to_sequences(train_tweets)
x_test = tokenizer.texts_to_sequences(test_tweets)
return tokenizer, x_train, x_test
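# The sequences returned above are variable-length lists of word indices; for most
# Keras models they would be padded to a fixed length first. A hedged sketch
# (max_tweet_len is assumed to be computed elsewhere, e.g. via get_max_len_info):
#   from keras.preprocessing.sequence import pad_sequences
#   tokenizer, x_train, x_test = encode_text_as_word_indexes(train_tweets, test_tweets)
#   x_train = pad_sequences(x_train, maxlen=max_tweet_len)
#   x_test = pad_sequences(x_test, maxlen=max_tweet_len)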
# Build random vector mappings of a vocabulary
def build_random_word2vec(tweets, embedding_dim=100, variance=1):
    print("\nBuilding random vector of mappings with dimension %d..." % embedding_dim)
    word2vec_map = {}
    seed(1457873)
    # Build the vocabulary over whole words (not characters) across all tweets
    words = set(' '.join([str(t) for t in tweets]).split())
    for word in words:
        if word not in word2vec_map:
            word2vec_map[word] = np.random.uniform(-variance, variance, size=(embedding_dim,))
return word2vec_map
# Load a set of pre-trained embeddings (can be GLoVe or emoji2vec)
def load_vectors(filename='glove.6B.100d.txt'):
print("\nLoading vector mappings from %s..." % filename)
word2vec_map = {}
if 'emoji' in filename:
f = open(path + '/models/emoji2vec/' + filename)
else: # by default, load the GLoVe embeddings
f = open(path + '/res/glove/' + filename)
for line in f:
values = line.split()
word = values[0]
weights = np.asarray(values[1:], dtype='float32')
word2vec_map[word] = weights
f.close()
    print('Found %s word vectors with embedding dimension %s'
% (len(word2vec_map), next(iter(word2vec_map.values())).shape[0]))
return word2vec_map
# Compute the word-embedding matrix
def get_embedding_matrix(word2vec_map, word_to_index, embedding_dim, init_unk=True, variance=None):
# Get the variance of the embedding map
if init_unk and variance is None:
variance = embedding_variance(word2vec_map)
print("Word vectors have variance ", variance)
# Initialize the embedding matrix as a numpy array of zeros of shape (vocab_len, dimensions of word vectors)
embedding_matrix = np.zeros((len(word_to_index) + 1, embedding_dim))
for word, i in word_to_index.items():
embedding_vector = word2vec_map.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
elif init_unk:
# Unknown tokens are initialized randomly by sampling from a uniform distribution [-var, var]
seed(1337603)
embedding_matrix[i] = np.random.uniform(-variance, variance, size=(1, embedding_dim))
# else:
# print("Not found: ", word)
return embedding_matrix
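# A hedged sketch of how the matrix is typically plugged into a Keras Embedding
# layer (row 0 is reserved for padding, hence the +1; only the standard Keras layer
# API is assumed here, the variable names are illustrative):
#   from keras.layers import Embedding
#   embedding_matrix = get_embedding_matrix(word2vec_map, word_to_index, embedding_dim=100)
#   embedding_layer = Embedding(input_dim=len(word_to_index) + 1, output_dim=100,
#                               weights=[embedding_matrix], trainable=False)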
# Get the vec representation of a set of tweets based on a specified embedding (can be a word or emoji mapping)
def get_tweets_embeddings(tweets, vec_map, embedding_dim=100, init_unk=False, variance=None, weighted_average=True):
# Get the variance of the embedding map
if init_unk and variance is None:
variance = embedding_variance(vec_map)
print("Vector mappings have variance ", variance)
# If set, calculate the tf-idf weight of each embedding, otherwise, no weighting (all weights are 1.0)
if weighted_average:
weights = get_tf_idf_weights(tweets, vec_map)
else:
weights = {k: 1.0 for k in vec_map.keys()}
tw_emb = np.zeros((len(tweets), embedding_dim))
for i, tw in enumerate(tweets):
total_valid = 0
for word in tw.split():
embedding_vector = vec_map.get(word)
if embedding_vector is not None:
tw_emb[i] = tw_emb[i] + embedding_vector * weights[word]
total_valid += 1
elif init_unk:
seed(1337603)
tw_emb[i] = np.random.uniform(-variance, variance, size=(1, embedding_dim))
# else:
# print("Not found: ", word)
# Get the average embedding representation for this tweet
tw_emb[i] /= float(max(total_valid, 1))
return tw_emb
# Based on the deepmoji project, predicting emojis for each tweet -- done using their pre-trained weights
# Here we extract the relevant emojis (with an individual probability of being accurate over the set threshold)
def get_deepmojis(filename, threshold=0.05):
print("\nGetting deep-mojis for each tweet in %s..." % filename)
df = read_csv(path + "/res/deepmoji/" + filename, sep='\t')
pred_mappings = load_file(path + "/res/emoji/wanted_emojis.txt")
emoji_pred = []
for index, row in df.iterrows():
tw_pred = []
for top in range(5):
if row['Pct_%d' % (top+1)] >= threshold:
tw_pred.append(row['Emoji_%d' % (top + 1)])
emoji_pred.append([pred_mappings[t] for t in tw_pred])
print("Couldn't find a strong emoji prediction for %d emojis" % len([pred for pred in emoji_pred if pred == []]))
return emoji_pred
# Just a dummy function that I used for the demo (printing the predicted deepmojis for each tweet in my demo set)
def get_demo_emojis(filename, data):
deepmojis = load_file(path + "/res/datasets/demo/deepmoji_" + filename)
emojis = data_proc.extract_emojis(data)
all_emojis = [deepmojis[i] if emojis[i] == ''
else emojis[i] + ' ' + deepmojis[i] for i in range(len(emojis))]
all_emojis = [' '.join(set(e.split())) for e in all_emojis]
for d, e in zip(data[20:40], all_emojis[20:40]):
print("Tweet: ", d)
print("Predicted emojis: ", e, "\n")
return all_emojis
# Calculate the variance of an embedding (like glove, word2vec, emoji2vec, etc)
# Used to sample new uniform distributions of vectors in the interval [-variance, variance]
def embedding_variance(vec_map):
variance = np.sum([np.var(vec) for vec in vec_map.values()]) / len(vec_map)
return variance
# Shuffle the words in all tweets
def shuffle_words(tweets):
shuffled = []
for tweet in tweets:
words = [word for word in tweet.split()]
np.random.shuffle(words)
shuffled.append(' '.join(words))
return shuffled
# Get the tf-idf weighting scheme (used to measure the contribution of a word in a tweet => weighted sum of embeddings)
def get_tf_idf_weights(tweets, vec_map):
df = {}
for tw in tweets:
words = set(tw.split())
for word in words:
if word not in df:
df[word] = 0.0
df[word] += 1.0
idf = OrderedDict()
for word in vec_map.keys():
n = 1.0
if word in df:
n += df[word]
score = math.log(len(tweets) / float(n))
idf[word] = score
return idf
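# Note: despite the name, the weights returned above are plain inverse document
# frequencies, idf(w) = log(N / (1 + df(w))) where N is the number of tweets and
# df(w) the number of tweets containing w (words never seen get log(N)); the "tf"
# part only enters implicitly through the per-occurrence summation in get_tweets_embeddings.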
# Compute the similarity of 2 vectors, both of shape (n, )
def cosine_similarity(u, v):
dot = np.dot(u, v)
norm_u = np.sqrt(np.sum(u ** 2))
norm_v = np.sqrt(np.sum(v ** 2))
cosine_distance = dot / (norm_u * norm_v)
return cosine_distance
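# A small self-contained check (identical directions give 1.0, orthogonal give 0.0):
#   cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0]))  ->  1.0
#   cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 3.0]))  ->  0.0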
# Convert emojis to unicode representations by removing any variation selectors
# Info: http://www.unicode.org/charts/PDF/UFE00.pdf
def convert_emoji_to_unicode(emoji):
unicode_emoji = emoji.encode('unicode-escape')
find1 = unicode_emoji.find(b"\\ufe0f")
unicode_emoji = unicode_emoji[:find1] if find1 != -1 else unicode_emoji
find2 = unicode_emoji.find(b"\\ufe0e")
unicode_emoji = unicode_emoji[:find2] if find2 != -1 else unicode_emoji
return unicode_emoji
# Performs the word analogy task: a is to b as c is to ____.
def make_analogy(a, b, c, vec_map):
a = convert_emoji_to_unicode(a)
b = convert_emoji_to_unicode(b)
c = convert_emoji_to_unicode(c)
e_a, e_b, e_c = vec_map[a], vec_map[b], vec_map[c]
max_cosine_sim = -100
best = None
best_list = {}
for v in vec_map.keys():
# The best match shouldn't be one of the inputs, so pass on them.
if v in [a, b, c]:
continue
# Compute cosine similarity between the vector (e_b - e_a) and the vector ((w's vector representation) - e_c)
cosine_sim = cosine_similarity(e_b - e_a, vec_map[v] - e_c)
best_list[v] = cosine_sim
if cosine_sim > max_cosine_sim:
max_cosine_sim = cosine_sim
best = v
sorted_keys = sorted(best_list, key=best_list.get, reverse=True)
print("Top 5 most similar emojis: ", [r.decode('unicode-escape') for r in sorted_keys[:5]])
print(str.format('{} - {} + {} = {}', a.decode('unicode-escape'), b.decode('unicode-escape'),
c.decode('unicode-escape'), best.decode('unicode-escape')), "\n\n")
# Get the Euclidean distance between two vectors
def euclidean_distance(u_vector, v_vector):
distance = np.sqrt(np.sum([(u - v) ** 2 for u, v in zip(u_vector, v_vector)]))
return distance
# Given a tweet, return the scores of the most similar/dissimilar pairs of words
def get_similarity_measures(tweet, vec_map, weighted=False, verbose=True):
# Filter a bit the tweet so that no punctuation and no stopwords are included
stopwords = data_proc.get_stopwords_list()
filtered_tweet = list(set([w.lower() for w in tweet.split()
if w.isalnum() and w not in stopwords and w.lower() in vec_map.keys()]))
# Compute similarity scores between any 2 words in filtered tweet
similarity_scores = []
max_words = []
min_words = []
max_score = -100
min_score = 100
for i in range(len(filtered_tweet) - 1):
wi = filtered_tweet[i]
for j in range(i + 1, len(filtered_tweet)):
wj = filtered_tweet[j]
similarity = cosine_similarity(vec_map[wi], vec_map[wj])
if weighted:
similarity /= euclidean_distance(vec_map[wi], vec_map[wj])
similarity_scores.append(similarity)
if max_score < similarity:
max_score = similarity
max_words = [wi, wj]
if min_score > similarity:
min_score = similarity
min_words = [wi, wj]
if verbose:
print("Filtered tweet: ", filtered_tweet)
if max_score != -100:
print("Maximum similarity is ", max_score, " between words ", max_words)
else:
print("No max! Scores are: ", similarity_scores)
if min_score != 100:
print("Minimum similarity is ", min_score, " between words ", min_words)
else:
print("No min! Scores are: ", similarity_scores)
return max_score, min_score
# Custom metric function adjusted from https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras
def f1_score(y_true, y_pred):
# Recall metric. Only computes a batch-wise average of recall,
# a metric for multi-label classification of how many relevant items are selected.
def recall(y_true, y_pred):
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
return recall
# Precision metric. Only computes a batch-wise average of precision,
# a metric for multi-label classification of how many selected items are relevant.
def precision(y_true, y_pred):
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return precision
precision = precision(y_true, y_pred)
recall = recall(y_true, y_pred)
return 2 * ((precision*recall) / (precision+recall))
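# A hedged usage sketch: pass the function as an extra metric when compiling, and
# hand it back through custom_objects when re-loading a model saved with it
# (the optimizer/loss choices below are illustrative only):
#   model.compile(optimizer='adam', loss='binary_crossentropy',
#                 metrics=['accuracy', f1_score])
#   loaded = keras.models.load_model('model.h5', custom_objects={'f1_score': f1_score})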
# This code allows you to see the mislabelled examples
def analyse_mislabelled_examples(x_test, y_test, y_pred):
for i in range(len(y_test)):
num = np.argmax(y_pred[i])
if num != y_test[i]:
print('Expected:', y_test[i], ' but predicted ', num)
print(x_test[i])
def print_statistics(y, y_pred):
accuracy = metrics.accuracy_score(y, y_pred)
precision = metrics.precision_score(y, y_pred, average='weighted')
recall = metrics.recall_score(y, y_pred, average='weighted')
f_score = metrics.f1_score(y, y_pred, average='weighted')
print('Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nF_score: %.3f\n'
% (accuracy, precision, recall, f_score))
print(metrics.classification_report(y, y_pred))
return accuracy, precision, recall, f_score
def plot_training_statistics(history, plot_name, also_plot_validation=False, acc_mode='acc', loss_mode='loss'):
# Plot Accuracy
plt.figure()
plt.plot(history.history[acc_mode], 'k-', label='Training Accuracy')
if also_plot_validation:
plt.plot(history.history['val_' + acc_mode], 'r--', label='Validation Accuracy')
plt.title('Training vs Validation Accuracy')
else:
plt.title('Training Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='center right')
plt.ylim([0.0, 1.0])
plt.savefig(path + plot_name + "_acc.png")
print("Plot for accuracy saved to %s" % (path + plot_name + "_acc.png"))
# Plot Loss
plt.figure()
plt.plot(history.history[loss_mode], 'k-', label='Training Loss')
if also_plot_validation:
plt.plot(history.history['val_' + loss_mode], 'r--', label='Validation Loss')
plt.title('Training vs Validation Loss')
else:
        plt.title('Training Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='center right')
plt.savefig(path + plot_name + "_loss.png")
print("Plot for loss saved to %s" % (path + plot_name + "_loss.png"))
# This is used to plot the coefficients that have the greatest impact on a classifier like SVM
# feature_names = a dictionary of indices/feature representations to words (or whatever you're extracting features from)
def plot_coefficients(classifier, feature_names, top_features=20, plot_name="/bow_models/bow_binary_"):
# Get the top most positive/negative coefficients
coef = classifier.coef_.ravel()
top_positive_coefficients = np.argsort(coef)[-top_features:]
top_negative_coefficients = np.argsort(coef)[:top_features]
top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
x_names = [feature_names[feature] for feature in top_coefficients]
# Plot the coefficients
plt.figure(figsize=(15, 5))
colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
plt.xticks(np.arange(0, 2 * top_features), x_names, rotation=30, ha='right')
plt.ylabel("Coefficient Value")
plt.title("Visualising the top %d features taken up by an SVM model" % top_features)
to_save_filename = path + "/plots/" + plot_name + "top%d_coefficients.png" % top_features
plt.savefig(to_save_filename)
print("Coefficients' visualisation saved to %s\n" % to_save_filename)
# Plot the results obtained for different settings (stored in a Data Frame) as a boxplot
def boxplot_results(results, filename):
if not results.empty:
plt.figure()
results.boxplot()
to_save_filename = path + "/plots/bow_models/" + filename
plt.savefig(to_save_filename)
print("Boxplot saved to %s " % to_save_filename)
# Method to print the features used
# feature_options - list of True/False values; specifies which set of features is active
# feature_names - list of names for each feature set
def print_features(feature_options, feature_names):
print("\n========================= FEATURES =========================\n")
for name, value in zip(feature_names, feature_options):
line_new = '{:>30} {:>10}'.format(name, value)
print(line_new)
print("\n==================================================================\n")
def print_feature_values(current_set, pragmatic, lexical, pos, sent, topic, sim):
i = 0
print("\n========================= %s FEATURES =========================\n" % current_set)
for a, b, c, d, e, f in zip(pragmatic, lexical, pos, sent, topic, sim):
print("\nTweet ", i)
print("Pragmatic:\n", a)
print("Lexical:\n", b)
print("POS:\n", c)
print("Sentiment:\n", d)
print("Topic:\n", e)
print("Similarity:\n", f)
print("\n==================================================================\n")
i += 1
def print_feature_values_demo(tweets, pragmatic, lexical, pos, sent, topic, sim):
print("\n========================= FEATURE VALUES =========================\n")
for tw, a, b, c, d, e, f in zip(tweets[:20], pragmatic[:20], lexical[:20], pos[:20],
sent[:20], topic[:20], sim[:20]):
print("Tweet:\t", tw)
print("Pragmatic:\t", a)
print("Lexical:\t", b)
print("POS:\t", c)
print("Sentiment:\t", d)
print("Topic:\t", e)
print("Similarity:\t", f)
print("\n==================================================================\n")
# Method to print the header of the currently running model
def print_model_title(name):
print("\n==================================================================")
print('{:>20}'.format(name))
print("==================================================================\n")
# Method that prints the settings for each DNN model
def print_settings(max_tweet_length, embedding_vector_dim, hidden_units,
epochs, batch_size, dropout, emb_type, trainable):
print_model_title("Settings")
print("Max tweet length = ", max_tweet_length)
print("Embedding vector dimension = ", embedding_vector_dim)
print("Hidden units = ", hidden_units)
print("Epochs = ", epochs)
print("Batch size = ", batch_size)
print("Dropout = ", dropout)
print("Embeddings type = ", emb_type)
print("Trainable = ", trainable)
print("==================================================================\n")
# This allows me to print both to file and to standard output at the same time
class writer:
def __init__(self, *writers):
self.writers = writers
def write(self, text):
for w in self.writers:
w.write(text)
def flush(self):
pass
def initialize_writer(file_path):
    dir_path = os.path.dirname(file_path)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
fout = open(file_path, 'a')
sys.stdout = writer(sys.stdout, fout)
print("\n\n > Current date and time: %s\n" % str(datetime.datetime.now()))