-
Notifications
You must be signed in to change notification settings - Fork 7
/
buildOffsets.py
54 lines (43 loc) · 1.07 KB
/
buildOffsets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import gensim
from gensim import corpora, models, similarities
from gensim.corpora import WikiCorpus, MmCorpus
from gensim.corpora import dictionary
import logging
from gensim.models.word2vec import BrownCorpus, Word2Vec
from gensim.models import Phrases
import nltk, os
def offset(pair):
    """Return the vector offset between the two words in ``pair``.

    Both words are looked up in the module-level ``model``. The result is
    ``model[pair[0]] - model[pair[1]]``. If a word is not in
    ``model.vocab``, "<word> missing" is printed and an empty list is
    returned; the first word is checked before the second.
    """
    head = pair[0]
    if head not in model.vocab:
        print(head+" missing")
        return []

    # The second word is only read once the first is known to be present.
    tail = pair[1]
    if tail not in model.vocab:
        print(tail+" missing")
        return []

    return model[head] - model[tail]
def toStr(offs):
    """Format the values in ``offs`` as CSV fields, each preceded by a comma.

    Every value is rendered with ``'%f'`` (six digits after the decimal
    point). The result starts with a comma so it can be appended directly
    after existing columns. An empty ``offs`` yields the empty string.
    """
    # A single join avoids rebuilding the string on every iteration.
    return "".join(",%f" % o for o in offs)
# Number of V columns written to the header; it is also part of the model
# file name. Presumably the model's vector size -- confirm against the
# model that is actually loaded.
SIZE=300
modelName="sg0HS0Size"+str(SIZE)
model=Word2Vec.load("models/"+modelName)
pairList="pairs.csv"
offsetOutFile="offsets/"+ modelName+".csv"
print("Offsets file will be "+offsetOutFile+"\n")

# Context managers close both files even if a lookup or a write fails midway.
with open(offsetOutFile, "w") as out, open(pairList) as pairs:
    # Header: three label columns followed by V1..V<SIZE>.
    out.write("w1,w2,rel" + "".join(",V" + str(i) for i in range(1, SIZE + 1)) + "\n")

    for line in pairs:
        fields = line.strip().lower().split(",")
        if len(fields) < 3:
            # Blank lines are skipped quietly; other short rows are reported
            # rather than crashing the whole run with an IndexError.
            if fields != [""]:
                print("skipping malformed line: " + ",".join(fields))
            continue

        # Only the first three fields are used; extra fields are ignored.
        w1, w2, rel = fields[0], fields[1], fields[2]
        temp = toStr(offset((w1, w2)))
        # offset() returns an empty list when either word is missing from the
        # model, which formats to "" -- such pairs are left out of the output.
        if temp:
            out.write(w1 + "," + w2 + "," + rel + temp + "\n")