-
Notifications
You must be signed in to change notification settings - Fork 1
/
embedding_utils.py
51 lines (46 loc) · 1.56 KB
/
embedding_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
def loadEmbeddingModel(File):
    """Load word embeddings from a space-separated text file.

    Each non-blank line is split on single spaces: the first field is the
    word and every remaining field is parsed as a float component of its
    vector.

    Args:
        File: Path of the embedding text file to read.

    Returns:
        A dict mapping each word to a 1-D ``numpy`` array of its vector. If a
        word occurs on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    print ("Loading Embedding Model:", File)
    model = {}
    # The context manager closes the file even if parsing a line raises.
    with open(File, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline at the end of the
                # file) would otherwise register '' as a word with an empty
                # vector.
                continue
            row = stripped.split(' ')
            word = row[0]
            model[word] = np.array([float(val) for val in row[1:]])
    print ("Done.",len(model)," words loaded!")
    return model
def create_embeddings_matrix(embedding_model, dictionary, full_dictionary, d=300):
    """Build a column-per-word embedding matrix for a vocabulary.

    Args:
        embedding_model: Mapping from word to its embedding vector (must
            support ``in`` and item lookup).
        dictionary: Mapping from word to the column index it occupies.
        full_dictionary: Not used by this function; kept in the signature
            for callers that pass it.
        d: Number of rows of the matrix (length of each embedding vector).

    Returns:
        A tuple ``(matrix, missing)``. ``matrix`` is a zero-initialised array
        of shape ``(d, len(dictionary) + 1)`` whose column ``i`` holds the
        vector of the word mapped to ``i``. ``missing`` lists the indices of
        the words absent from ``embedding_model``; their columns stay zero.
    """
    vocab_size = len(dictionary)
    # One extra column beyond the vocabulary size.
    matrix = np.zeros(shape=(d, vocab_size + 1))
    missing = []
    n_found = 0
    for token, column in dictionary.items():
        if token in embedding_model:
            n_found += 1
            matrix[:, column] = embedding_model[token]
            continue
        missing.append(column)
    # Every miss appends exactly one index, so the list length is the count.
    print('Number of not found words = ', len(missing))
    print('Number of found words = ', n_found)
    return matrix, missing
def pick_most_similar_words(src_word, dist_mat, ret_count=10, threshold=None):
    """Return the nearest neighbours of a word from a distance matrix.

    Row ``src_word`` of ``dist_mat`` is sorted in ascending order and the
    first entry of that order is skipped (presumably the word itself at
    distance 0 -- confirm against how ``dist_mat`` is built). The next
    ``ret_count`` entries are the candidates.

    Args:
        src_word: Row index of the source word in ``dist_mat``.
        dist_mat: 2-D array indexed as ``dist_mat[src_word, :]``; assumed to
            hold pairwise word distances.
        ret_count: Maximum number of neighbours to return.
        threshold: If not ``None``, keep only neighbours whose distance is
            strictly below this value.

    Returns:
        A pair ``(indices, distances)`` of arrays ordered by ascending
        distance. Returns ``([], [])`` when there are no candidates or when
        the farthest candidate has distance 0.
    """
    dist_order = np.argsort(dist_mat[src_word, :])[1:1 + ret_count]
    dist_list = dist_mat[src_word][dist_order]
    # No candidates at all (ret_count == 0 or a single-column matrix):
    # previously this raised IndexError on dist_list[-1].
    if dist_list.size == 0:
        return [], []
    # The farthest candidate having distance 0 means every candidate does
    # (assuming non-negative distances); such a row yields no neighbours.
    if dist_list[-1] == 0:
        return [], []
    if threshold is None:
        return dist_order, dist_list
    keep = dist_list < threshold
    return dist_order[keep], dist_list[keep]