embed_funcs.py
import numpy as np
from sklearn.utils import check_random_state


class WordEmbeddings(object):
    def __init__(self):
        self.num_words = 0       # vocabulary size
        self.total_count = 0     # total number of word occurrences
        self.words = []          # index -> word
        self.embedding_dim = 0
        self.vectors = np.zeros((0, 0))        # embedding matrix, one row per word
        self.counts = np.zeros(0, dtype=int)   # corpus count of each word
        self.probs = np.zeros(0)               # sampling probability of each word
        self.word_dict = {}                    # word -> index
    def load_from_word2vec(self, file_prefix):
        vocab_file = file_prefix + '.vocab'
        vec_file = file_prefix + '.bin'
        with open(vec_file) as vec_fs:
            # Header line: "<vocab size> <embedding dimension>"
            tokens = vec_fs.readline().split()
            self.num_words = int(tokens[0])      # number of unique words (size of vocab)
            self.embedding_dim = int(tokens[1])  # embedding dimension (typically 300)
            self.counts = np.zeros(self.num_words, dtype=int)              # count of each word in the vocab
            self.vectors = np.zeros((self.num_words, self.embedding_dim))  # embedding matrix
            self.probs = np.ones(self.num_words)                           # word probabilities
            # Remaining lines: "<word> <v_1> ... <v_dim>"
            for i, line in enumerate(vec_fs):
                if not line or line == "\n":
                    continue
                tokens = line.split()
                word = tokens[0]
                self.words.append(word)
                self.word_dict[word] = i
                self.vectors[i] = [float(x) for x in tokens[1:]]
        # Vocabulary file: one "<word>:<count>" entry per line
        with open(vocab_file) as vocab_fs:
            for line in vocab_fs:
                if not line or line == "\n":
                    continue
                tokens = line.split(":")
                word, count = tokens[0], int(tokens[1])
                if word in self.word_dict:
                    self.counts[self.word_dict[word]] = count
        self.total_count = self.counts.sum()        # total number of word occurrences
        self.probs = self.probs * self.counts
        self.probs = self.probs / self.total_count  # unigram probability of each word
    def downsample_frequent_words(self, frequency_threshold=1e-3):
        # word2vec-style subsampling: words whose frequency exceeds the threshold
        # get a keep-probability below 1; rare words are clipped to 1.
        threshold_count = float(frequency_threshold * self.total_count)
        keep_prob = (np.sqrt(self.counts / threshold_count) + 1) * (threshold_count / self.counts)
        keep_prob = np.minimum(keep_prob, 1.0)  # clip the keep-probability at 1
        self.probs = keep_prob * self.counts
        self.probs /= self.probs.sum()
    def sample_batches(self, batch_size=1, train_set_ids=None, random_state=0):
        rng = check_random_state(random_state)
        if train_set_ids is not None:
            p = self.probs[train_set_ids]
            p /= p.sum()          # renormalise over the training subset
            a = train_set_ids
        else:
            p = self.probs
            a = self.num_words    # rng.choice treats an int as np.arange(num_words)
        # Infinite generator of word-id batches, sampled with replacement
        # according to the current word probabilities.
        while True:
            yield rng.choice(a, size=batch_size, replace=True, p=p)
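
A minimal usage sketch, not part of the original file: it assumes a text-format .bin/.vocab pair of the kind load_from_word2vec expects, and the data/wiki_word2vec prefix, threshold, and batch size are illustrative placeholders only.

if __name__ == "__main__":
    emb = WordEmbeddings()
    # Hypothetical file prefix: reads data/wiki_word2vec.bin and data/wiki_word2vec.vocab
    emb.load_from_word2vec("data/wiki_word2vec")
    emb.downsample_frequent_words(frequency_threshold=1e-3)

    # Draw a few batches of word ids, weighted by the downsampled probabilities.
    batches = emb.sample_batches(batch_size=4, random_state=0)
    for _ in range(3):
        ids = next(batches)
        print([emb.words[i] for i in ids])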