-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathglove_utils.py
117 lines (93 loc) · 3.72 KB
/
glove_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# coding: utf-8
import numpy as np
import pickle
def loadGloveModel(gloveFile):
    """
    Load GloVe word vectors from a text file into a dictionary.

    Every non-blank line is split on single spaces: the first field is
    the word and the remaining fields are parsed as the float components
    of its vector.

    Args:
        gloveFile: Path to a UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each word (str) to a 1-D numpy array of floats.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    print ("Loading Glove Model")
    model = {}
    # The context manager closes the handle even when parsing fails midway.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            row = line.strip().split(' ')
            word = row[0]
            if not word:
                # Blank line (e.g. trailing newline at end of file): skip it
                # instead of storing an empty-string key with an empty vector.
                continue
            model[word] = np.array([float(val) for val in row[1:]])
    print ("Done.",len(model)," words loaded!")
    return model
def save_glove_to_pickle(glove_model, file_name):
    """
    Serialize a GloVe model to disk with pickle.

    Args:
        glove_model: The object to store (e.g. the dict returned by
            loadGloveModel).
        file_name: Destination path; an existing file is overwritten.
    """
    # Pickle streams are bytes, so the file is opened in binary mode.
    # open() rejects an `encoding` argument in binary mode (ValueError),
    # hence none is passed here.
    with open(file_name, 'wb') as f:
        pickle.dump(glove_model, f)
def load_glove_from_pickle(file_name):
    """
    Load a pickled object (e.g. a GloVe model) from disk.

    Args:
        file_name: Path to a pickle file.

    Returns:
        Whatever object was stored in the pickle.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # come from a trusted source.
    with open(file_name, 'rb') as f:
        return pickle.load(f)
def create_embeddings_matrix(glove_model, dictionary, full_dictionary=None, d=300):
    """
    Build a (d, len(dictionary) + 1) embedding matrix from a GloVe model.

    Column `i` holds the GloVe vector of the word that `dictionary` maps
    to index `i`. Columns of words missing from `glove_model` stay zero.

    Args:
        glove_model: Mapping from word to its embedding vector of length d.
        dictionary: Mapping from word to its column index in the matrix.
            Indices must lie in [0, len(dictionary)].
        full_dictionary: Unused; kept so existing callers keep working.
        d: Dimensionality of the embedding vectors.

    Returns:
        (embedding_matrix, unfound): the matrix described above and the
        list of indices whose words were not found in `glove_model`.
    """
    MAX_VOCAB_SIZE = len(dictionary)
    # One extra column beyond the vocabulary size; presumably index 0 is
    # reserved (e.g. for padding) -- TODO confirm against the tokenizer.
    embedding_matrix = np.zeros(shape=(d, MAX_VOCAB_SIZE + 1))
    unfound = []
    for w, i in dictionary.items():
        if w in glove_model:
            embedding_matrix[:, i] = glove_model[w]
        else:
            unfound.append(i)
    print('Number of not found words = ', len(unfound))
    return embedding_matrix, unfound
def pick_most_similar_words_old(src_word, dist_mat, ret_count=10, threshold=None):
    """
    Return the nearest neighbours of a word from a full distance matrix.

    Args:
        src_word: Row index of the source word in `dist_mat`.
        dist_mat: 2-D numpy array where `dist_mat[i, j]` is the distance
            between words `i` and `j`.
        ret_count: Maximum number of neighbours to return.
        threshold: If given, only neighbours with distance strictly below
            this value are kept.

    Returns:
        (indices, distances) as numpy arrays ordered by increasing
        distance, or two empty lists when no neighbour is selected or the
        farthest selected neighbour has distance 0.
    """
    # The first entry of the sorted order is skipped; presumably it is the
    # word itself at distance 0 -- verify against how dist_mat is built.
    dist_order = np.argsort(dist_mat[src_word, :])[1:1 + ret_count]
    dist_list = dist_mat[src_word][dist_order]
    # Guard the empty case first: indexing [-1] on an empty selection
    # (ret_count == 0 or a single-column matrix) would raise IndexError.
    if dist_list.size == 0 or dist_list[-1] == 0:
        return [], []
    if threshold is not None:
        mask = np.where(dist_list < threshold)
        return dist_order[mask], dist_list[mask]
    return dist_order, dist_list
def pick_most_similar_words(src_word, small_dist_mat, ret_count=10, threshold=None, max_index=50000):
    """
    Return the nearest neighbours of a word from a compact neighbour table.

    Args:
        src_word: Row index of the source word in `small_dist_mat`.
        small_dist_mat: 3-D numpy array where `small_dist_mat[w, k, 0]` is
            the index of the k-th neighbour of word `w` and
            `small_dist_mat[w, k, 1]` is the matching distance.
        ret_count: Maximum number of neighbours to return.
        threshold: If given, only neighbours with distance strictly below
            this value are kept.
        max_index: Neighbours whose index exceeds this value are dropped.
            Defaults to 50000, the cut-off that was previously hard-coded.

    Returns:
        (indices, distances) as numpy arrays with the dtype of
        `small_dist_mat`.
    """
    dist_order = small_dist_mat[src_word, :, 0]
    dist_list = small_dist_mat[src_word, :, 1]
    # Keep as many leading slots as there are strictly positive indices.
    # NOTE(review): a neighbour with index 0 is not counted, so the last
    # valid slot is truncated instead -- confirm that index 0 can never be
    # a real neighbour.
    n_return = np.sum(dist_order > 0)
    dist_order, dist_list = dist_order[:n_return], dist_list[:n_return]
    dist_order, dist_list = dist_order[:ret_count], dist_list[:ret_count]
    if threshold is not None:
        mask = np.where(dist_list < threshold)
        dist_order, dist_list = dist_order[mask], dist_list[mask]
    in_range = np.where(dist_order <= max_index)
    dist_order, dist_list = dist_order[in_range], dist_list[in_range]
    return dist_order, dist_list
def create_small_embedding_matrix(dist_mat, MAX_VOCAB_SIZE, threshold=1.5, retain_num=50):
    """
    Compress a full distance matrix into a per-word nearest-neighbour table.

    For each word index `i` in [0, MAX_VOCAB_SIZE] the `retain_num`
    closest entries of row `i` are kept, skipping the first entry of the
    sorted order and optionally filtering by `threshold`.

    Args:
        dist_mat: 2-D numpy array of pairwise distances with at least
            MAX_VOCAB_SIZE + 1 rows.
        MAX_VOCAB_SIZE: Largest word index to process.
        threshold: If not None, only neighbours with distance strictly
            below this value are kept.
        retain_num: Number of neighbour slots stored per word.

    Returns:
        Float array of shape (MAX_VOCAB_SIZE + 1, retain_num, 2) where
        `[..., 0]` holds neighbour indices and `[..., 1]` the matching
        distances. Unused slots are filled with -1 in both channels.
    """
    # Pre-fill with the -1 padding value so each row only needs its
    # leading, valid slots written.
    small_embedding_matrix = np.full((MAX_VOCAB_SIZE + 1, retain_num, 2), -1.0)
    for i in range(MAX_VOCAB_SIZE + 1):
        if i % 1000 == 0:
            print('%d/%d processed.' % (i, MAX_VOCAB_SIZE))
        # The first entry of the sorted order is skipped; presumably it is
        # the word itself at distance 0 -- verify against dist_mat.
        dist_order = np.argsort(dist_mat[i, :])[1:1 + retain_num]
        dist_list = dist_mat[i][dist_order]
        if threshold is not None:
            mask = np.where(dist_list < threshold)
            dist_order, dist_list = dist_order[mask], dist_list[mask]
        n_return = len(dist_order)
        small_embedding_matrix[i, :n_return, 0] = dist_order
        small_embedding_matrix[i, :n_return, 1] = dist_list
    return small_embedding_matrix