forked from SNUDerek/MLsnippets
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembedding.py
53 lines (41 loc) · 1.7 KB
/
embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import codecs, json
import numpy as np
from gensim.models import Word2Vec
from keras.layers import Embedding
# tokenizing function
def tokenize(sentence):
    """Split *sentence* into tokens on single spaces.

    Newline characters are removed first; splitting is on ' ' exactly,
    so runs of consecutive spaces produce empty-string tokens.
    """
    stripped = sentence.replace('\n', '')
    tokens = stripped.split(' ')
    return tokens
# create embeddings with gensim
def create_embeddings(file_name,
                      embeddings_path='temp_embeddings/embeddings.gensimmodel',
                      vocab_path='temp_embeddings/mapping.json',
                      **params):
    """Train a gensim Word2Vec model on a line-per-sentence corpus file.

    Parameters
    ----------
    file_name : str
        Path to a UTF-8 text file, one sentence per line; each line is
        tokenized with ``tokenize``.
    embeddings_path : str
        Where to save the trained gensim model (``model.save`` format).
    vocab_path : str
        Where to write the word -> index mapping as JSON.
    **params
        Passed straight through to ``gensim.models.Word2Vec``.

    Returns
    -------
    (vocab, model) : (dict, Word2Vec)
        The word -> index mapping and the trained model.
    """
    class SentenceGenerator(object):
        # Re-iterable wrapper: Word2Vec iterates the corpus multiple times
        # (vocab build + training epochs), so this must be an iterable
        # object, not a one-shot generator.
        def __init__(self, filename):
            self.filename = filename

        def __iter__(self):
            # 'r' instead of 'rU': the 'U' mode flag was removed in
            # Python 3.11, and universal newlines are the default anyway.
            # The with-block also closes the handle, which the original
            # leaked on every pass over the corpus.
            with codecs.open(self.filename, 'r', encoding='utf-8') as fin:
                for line in fin:
                    yield tokenize(line)

    sentences = SentenceGenerator(file_name)
    model = Word2Vec(sentences, **params)
    model.save(embeddings_path)
    # gensim >= 4.0 replaced wv.vocab with wv.key_to_index; fall back to
    # the pre-4.0 attribute so either version works.
    # http://stackoverflow.com/questions/35596031/gensim-word2vec-find-number-of-words-in-vocabulary
    try:
        vocab = dict(model.wv.key_to_index)
    except AttributeError:
        vocab = dict((k, v.index) for k, v in model.wv.vocab.items())
    with open(vocab_path, 'w') as f:
        f.write(json.dumps(vocab))
    return vocab, model
# load vocabulary index from json file
def load_vocab(vocab_path='temp_embeddings/mapping.json'):
    """Read the saved word -> index JSON mapping and build its inverse.

    Returns
    -------
    (word2idx, idx2word) : (dict, dict)
        Forward mapping straight from the JSON file, plus the inverted
        index -> word dict.
    """
    with open(vocab_path, 'r') as handle:
        word2idx = json.loads(handle.read())
    idx2word = {index: word for word, index in word2idx.items()}
    return word2idx, idx2word
# embedding layer function
def word2vec_embedding_layer(embeddings_path='temp_embeddings/embeddings.npz'):
    """Build a Keras Embedding layer initialized from a saved weight matrix.

    Parameters
    ----------
    embeddings_path : str
        Path to a NumPy array saved with ``np.save`` — presumably of shape
        (vocab_size, embedding_dim); note the default here does not match
        the ``.gensimmodel`` file that ``create_embeddings`` saves, so
        callers must pass the path to an ``np.save``-style dump.

    Returns
    -------
    keras.layers.Embedding
        Layer with input_dim/output_dim taken from the weight matrix shape
        and the matrix itself as the initial weights.
    """
    # np.load accepts a path and closes the file itself; the original
    # np.load(open(path, 'rb')) leaked the file handle.
    weights = np.load(embeddings_path)
    layer = Embedding(input_dim=weights.shape[0],
                      output_dim=weights.shape[1],
                      weights=[weights])
    return layer