# -*- coding: utf-8 -*-
# sim.py -- SIF (smooth inverse frequency) sentence embeddings, forked from
# smujjiga/SIF; implements the weighted-average-plus-component-removal scheme
# of Arora et al. (2017), "A Simple but Tough-to-Beat Baseline for Sentence
# Embeddings".
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm

# Pretrained GloVe vectors and English Wikipedia word counts, both expected
# in the working directory.
wordfile = 'glove.840B.300d.txt'
weightfile = 'enwiki_vocab_min200.txt'


def getWordmap(textfile, only_these_words=None):
    """Load GloVe vectors from `textfile`, optionally restricted to
    `only_these_words`. Returns (index -> word dict, embedding matrix)."""
    words = dict()
    vectors = list()
    if only_these_words is not None:
        # Use a set for O(1) membership tests and removals.
        only_these_words = set(only_these_words)
        print("Total words: {0}".format(len(only_these_words)))
    with open(textfile, encoding='utf-8') as fp:
        for i, line in enumerate(tqdm(fp)):
            try:
                tokens = line.split()
                if only_these_words is not None:
                    if tokens[0] not in only_these_words:
                        continue
                    words[i] = tokens[0]
                    vectors.append(list(map(float, tokens[1:])))
                    only_these_words.remove(tokens[0])
                    if len(only_these_words) == 0:
                        break
                else:
                    words[i] = tokens[0]
                    vectors.append(list(map(float, tokens[1:])))
            except ValueError:
                # Some glove.840B lines contain tokens that do not parse as floats.
                print("Error in processing {0}".format(line))
    if only_these_words is not None:
        # Whatever is left was not found in the embedding file.
        print("Missing words: {0}".format(len(only_these_words)))
        with open('missing_words.txt', 'w', encoding='utf-8') as fp:
            for item in only_these_words:
                fp.write("%s\n" % item)
    return words, np.array(vectors)
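
# Shape sketch for the return values (hypothetical numbers, assuming the
# 300-d GloVe file above): if 1,200 of the requested words are found,
# `vectors` has shape (1200, 300). The keys of `words` are line numbers in
# the GloVe file, and its values, in insertion order, align row-for-row
# with `vectors`; get_embeddings below relies on that ordering via
# words.values().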


def getWordWeight(weightfile, a=1e-3):
    """SIF word weights: weight(w) = a / (a + p(w)), where p(w) is the
    unigram probability of w estimated from `weightfile`."""
    if a <= 0:  # a non-positive parameter makes no sense; fall back to unweighted
        a = 1.0
    df = pd.read_csv(weightfile, header=None, delimiter=' ')
    df.columns = ['word', 'count']
    df['weight'] = a / (a + (df['count'] / df['count'].sum()))
    return df
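
# Worked example of the weighting (illustrative probabilities only): with
# the default a = 1e-3, a rare word with unigram probability p = 1e-4 gets
# weight 1e-3 / (1e-3 + 1e-4) ~= 0.91, while a frequent word with p = 1e-2
# gets 1e-3 / (1e-3 + 1e-2) ~= 0.09, so rare words dominate the averages
# computed downstream.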


def get_all_unique_words(sentences):
    words = list()
    for s in sentences:
        for w in s.split():
            words.append(w)
    return list(set(words))


def lookupIDX(words, lower_words, w):
    """Map token `w` to its embedding-matrix row, falling back to the
    lowercased form, then an UNK row if present, then the last row."""
    if len(w) > 1 and w[0] == '#':
        w = w.replace("#", "")  # strip hashtags, e.g. Twitter-style input
    if w in words:
        return words[w]
    elif w.lower() in lower_words:
        return lower_words[w.lower()]
    elif 'UUUNKKK' in words:
        return words['UUUNKKK']
    else:
        return len(words) - 1


def sentences2idx(sentences, words, lower_words, id2weight_table):
    sentences_with_wordidx = list()
    for sent in sentences:
        sentences_with_wordidx.append(
            [lookupIDX(words, lower_words, word) for word in sent.split()])
    return prepare_data(sentences_with_wordidx, id2weight_table)


def prepare_data(list_of_seqs, id2weight_table):
    """Pad index sequences to a common length (padding value -1) and build
    the matching per-position weight matrix (padding weight 0)."""
    lengths = list(map(len, list_of_seqs))
    n_samples = len(list_of_seqs)
    maxlen = np.max(lengths)
    x = np.full((n_samples, maxlen), -1, dtype='int32')
    for idx, s in enumerate(list_of_seqs):
        x[idx, :lengths[idx]] = s
    x_weights = np.zeros((n_samples, maxlen)).astype('float32')
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            if x[i, j] != -1:
                x_weights[i, j] = id2weight_table[x[i, j]]
    return x, x_weights
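
# Small illustration (hypothetical indices): for list_of_seqs = [[4, 7], [2, 9, 5]],
# prepare_data returns
#   x         = [[4, 7, -1],
#                [2, 9,  5]]
#   x_weights = [[w[4], w[7], 0.0],
#                [w[2], w[9], w[5]]]
# so padded positions contribute nothing to the weighted average below.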


class params(object):
    def __init__(self):
        self.LW = 1e-5
        self.LC = 1e-5
        self.eta = 0.05
        self.rmpc = 1  # number of principal components to remove

    def __str__(self):
        return 'LW {0} , LC {1} , eta {2}'.format(self.LW, self.LC, self.eta)


# ---- SIF embedding: weighted average + principal component removal ----


def get_weighted_average(We, x, w):
    """Weighted average of word vectors per sentence: row i is
    sum_j w[i, j] * We[x[i, j]] divided by the number of non-padding words."""
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        # Padding positions have weight 0, so they drop out of the dot product.
        emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
    return emb


def compute_pc(X, npc=1):
    """First `npc` principal components of X (rows are sentence embeddings)."""
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_


def remove_pc(X, npc=1):
    """Project out the first `npc` principal components: XX = X - X pc^T pc."""
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX
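
# Why remove the top component (per the SIF paper): sentence vectors built
# this way share a large common direction, and subtracting each row's
# projection onto the top singular vector(s), X - (X pc^T) pc, discards it.
# Because each row of pc has unit norm, this is an orthogonal projection
# onto the complement of span(pc).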


def SIF_embedding(We, x, w, params):
    """SIF: weighted average of word vectors, then remove the projection
    onto the first `params.rmpc` principal component(s)."""
    emb = get_weighted_average(We, x, w)
    if params.rmpc > 0:
        emb = remove_pc(emb, params.rmpc)
    return emb


def get_embeddings(sentences):
    """Compute a SIF embedding for each sentence in `sentences`."""
    only_these_words = get_all_unique_words(sentences)
    words, vectors = getWordmap(wordfile, only_these_words)
    df_wordweights = getWordWeight(weightfile)
    # Join GloVe words to their (lowercased) frequency-based weights; words
    # absent from the weight file get weight 1, i.e. are treated as rare.
    df_words = pd.DataFrame(list(words.values()), columns=['word'])
    df_words['lowercase_word'] = df_words['word'].apply(str.lower)
    df = df_words.merge(df_wordweights, left_on='lowercase_word', right_on='word', how='left')[
        ['word_x', 'lowercase_word', 'count', 'weight']].rename(columns={'word_x': 'word'})
    df = df.fillna(1)
    # Lookup tables: token -> embedding row, and embedding row -> SIF weight.
    words2idx_table = pd.Series(df.index, index=df['word']).to_dict()
    lower_words2idx_table = pd.Series(df.index, index=df['lowercase_word']).to_dict()
    idx2weight_table = pd.Series(df['weight'], index=df.index).to_dict()
    x, weights = sentences2idx(sentences, words2idx_table, lower_words2idx_table, idx2weight_table)
    p = params()
    p.rmpc = 1  # remove the first principal component, as in the SIF paper
    encodings = SIF_embedding(vectors, x, weights, p)
    return encodings
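
# Minimal usage sketch (assumes glove.840B.300d.txt and enwiki_vocab_min200.txt
# are present in the working directory; the sentences are made up for
# illustration).
if __name__ == '__main__':
    sentences = ['the cat sat on the mat',
                 'a cat was sitting on a rug',
                 'stock prices fell sharply today']
    emb = get_embeddings(sentences)  # shape (3, 300) for 300-d GloVe
    # Cosine similarity between the first two sentences.
    a, b = emb[0], emb[1]
    cos = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print('similarity(s0, s1) = {0:.3f}'.format(cos))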