-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_embedding.py
91 lines (62 loc) · 2.12 KB
/
word_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from gensim.models.keyedvectors import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from tqdm import tqdm
def get_model(path="./models/gensim_glove_vectors.txt", binary=False):
    """Load pre-trained word vectors stored in word2vec format.

    Args:
        path: Location of the vector file. Defaults to the GloVe vectors
            file this script has always read.
        binary: Forwarded to ``KeyedVectors.load_word2vec_format``; the
            default ``False`` matches the text-format default file.

    Returns:
        The object produced by ``KeyedVectors.load_word2vec_format``.
    """
    print('Loading model...')
    return KeyedVectors.load_word2vec_format(path, binary=binary)
def get_words(path='./results/topics.pkl'):
    """Load the pickled topics table.

    Args:
        path: Location of the pickle file to read. Defaults to
            ``./results/topics.pkl``.

    Returns:
        The object deserialized by ``pandas.read_pickle``. ``add_embeddings``
        iterates it with ``iterrows()`` and reads the ``'number'`` and
        ``'topic_words'`` entries of each row.
    """
    print('Opening wordlist...')
    # NOTE: unpickling can execute arbitrary code; only point this at files
    # produced by a trusted run of the pipeline.
    return pd.read_pickle(path)
def words_coordinates(word_list, word_vectors):
    """Build a table holding each known word followed by its vector.

    Args:
        word_list: Iterable of words to look up.
        word_vectors: Object indexable by word that yields an iterable of
            vector components and raises ``KeyError`` for unknown words.

    Returns:
        A ``pandas.DataFrame`` with default integer column labels: column 0
        is the word, the remaining columns are its vector components. Words
        missing from ``word_vectors`` are reported on stdout and skipped.
    """
    word_plus_coordinates = []
    for word in word_list:
        # Only a missing word is an expected failure; any other error
        # (interrupts, broken backend) must surface instead of being hidden.
        try:
            vector = word_vectors[word]
        except KeyError:
            print(' {} was not found in model.'.format(word))
            continue
        current_row = [word]
        current_row.extend(vector)
        word_plus_coordinates.append(current_row)
    return pd.DataFrame(word_plus_coordinates)
def add_embeddings():
    """Attach word embeddings and 2-D t-SNE coordinates to every topic word.

    Loads the word vectors via ``get_model()`` and the topics table via
    ``get_words()``, emits one row per entry of each book's ``'topic_words'``,
    keeps only words present in the vector model, projects the vectors to two
    dimensions with ``TSNE`` and pickles the result to
    ``./results/coords_and_embeddings.pkl``.

    Returns:
        A ``pandas.DataFrame`` with columns ``x``, ``y``, ``book_num``,
        ``word`` and ``coords`` (the full embedding vector), indexed
        ``0..n-1``.

    Raises:
        ValueError: If none of the topic words is present in the model, so
            there is nothing to project.
    """
    glove_model = get_model()
    df = get_words()

    print('Transposing topic words into rows...')
    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer
    # exists in pandas >= 2.0.
    records = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        book_num = int(row['number'])
        for word in row['topic_words']:
            records.append({'book_num': book_num, 'word': word})

    print('Adding word embeddings...')
    embedded = []
    for record in records:
        # A word absent from the model is the only expected failure here;
        # such rows are silently left out, other errors propagate.
        try:
            record['coords'] = glove_model[record['word']]
        except KeyError:
            continue
        embedded.append(record)

    if not embedded:
        raise ValueError('No topic words were found in the embedding model.')

    print('Reducing dimensions...')
    vectors = np.asarray([record['coords'] for record in embedded])
    flat_coords = TSNE(n_components=2).fit_transform(vectors)

    data = pd.DataFrame(
        embedded, columns=['x', 'y', 'book_num', 'word', 'coords']
    )
    # Row order of ``data`` matches ``vectors``, so whole columns can be
    # assigned at once.
    data['x'] = flat_coords[:, 0]
    data['y'] = flat_coords[:, 1]

    print(data)
    data.to_pickle('./results/coords_and_embeddings.pkl')
    return data
# Script entry point: run the full embedding pipeline and write its pickle.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this also
# runs whenever the module is imported — confirm that is intended.
add_embeddings()