#
# Copyright (c) 2017-present, Babylon Health
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
import numpy as np


class FastVector:
    """
    Minimal wrapper for fastText word embeddings.
    ```
    Usage:
        $ model = FastVector(vector_file='/path/to/wiki.en.vec')
        $ 'apple' in model
        > True
        $ model['apple'].shape
        > (300,)
    ```
    """

    def __init__(self, vector_file='', transform=None):
        """Read in word vectors in fastText .vec (text) format."""
        self.word2id = {}
        # Captures word order; only used for export(), so that more
        # frequent words appear earlier in the output file.
        self.id2word = []

        print('reading word vectors from %s' % vector_file)
        with open(vector_file, 'r', encoding='utf-8') as f:
            # The first line of a .vec file holds the vocabulary size
            # and the embedding dimensionality.
            (self.n_words, self.n_dim) = \
                (int(x) for x in f.readline().rstrip('\n').split(' '))
            self.embed = np.zeros((self.n_words, self.n_dim))
            for i, line in enumerate(f):
                elems = line.rstrip('\n').split(' ')
                self.word2id[elems[0]] = i
                # numpy casts the string fields to float on assignment
                self.embed[i] = elems[1:self.n_dim + 1]
                self.id2word.append(elems[0])

        if transform is not None:
            print('Applying transformation to embedding')
            self.apply_transform(transform)

    def apply_transform(self, transform):
        """
        Apply the given transformation to the vector space.

        Right-multiplies the embedding matrix E by the transform:
            E = E @ transform

        `transform` can be either a string naming a text file that
        contains an ndarray (compatible with np.loadtxt) or a numpy
        ndarray.
        """
        transmat = np.loadtxt(transform) if isinstance(transform, str) else transform
        self.embed = np.matmul(self.embed, transmat)

    def export(self, outpath):
        """
        Transforming a large matrix of word vectors is expensive.
        This method lets you write the transformed matrix back to a
        file for future use.

        :param outpath: path of the output file to be written
        """
        with open(outpath, 'w', encoding='utf-8') as fout:
            # The header records the number of vectors and their
            # dimensionality, which takes the guesswork out of loading.
            fout.write(str(self.n_words) + " " + str(self.n_dim) + "\n")
            for token in self.id2word:
                vector_components = ["%.6f" % number for number in self[token]]
                vector_as_string = " ".join(vector_components)
                out_line = token + " " + vector_as_string + "\n"
                fout.write(out_line)

    @classmethod
    def cosine_similarity(cls, vec_a, vec_b):
        """Compute the cosine similarity between vec_a and vec_b."""
        return np.dot(vec_a, vec_b) / \
            (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))

    def __contains__(self, key):
        return key in self.word2id

    def __getitem__(self, key):
        return self.embed[self.word2id[key]]
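

# A minimal usage sketch tying the pieces above together. It assumes a
# local copy of pre-trained fastText vectors (e.g. 'wiki.en.vec'); the
# file paths below are placeholders, not files shipped with this code.
if __name__ == '__main__':
    en_model = FastVector(vector_file='wiki.en.vec')

    # Look up vectors and compare two words directly.
    print('apple' in en_model)
    print(FastVector.cosine_similarity(en_model['apple'], en_model['orange']))

    # apply_transform() accepts either an ndarray or a filename that
    # np.loadtxt can read; the identity matrix used here is a stand-in
    # for a learned alignment matrix.
    en_model.apply_transform(np.eye(en_model.n_dim))

    # Persist the transformed vectors for later reuse.
    en_model.export('wiki.en.aligned.vec')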