-
Notifications
You must be signed in to change notification settings - Fork 1.2k
/
Copy pathvec2bin.py
63 lines (50 loc) · 1.57 KB
/
vec2bin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/python
import sys
import getopt
import numpy as np
from tqdm import tqdm
# Sample input/output file names. Nothing in the visible code reads these
# module-level values: vec2bin() takes parameters of the same names, and
# main() requires both paths on the command line.
input_path = 'wiki.fr.vec'
# NOTE(review): 'wifi' looks like a typo for 'wiki' -- confirm before relying
# on this value.
output_path = 'wifi.fr.bin'
def vec2bin(input_path, output_path):
    """Convert a text word-vector file into a binary word-vector file.

    The input must start with a header line ``<vocab_size> <vector_size>``
    followed by one entry per line: the word, a single space, then the
    vector components separated by whitespace.

    The output repeats the header line verbatim and then, for each entry,
    contains the word bytes, one space, and the components as raw
    ``float32`` values in the machine's native byte order. No newline is
    written between entries.

    Args:
        input_path: Path of the text vector file to read.
        output_path: Path of the binary file to create (overwritten if it
            already exists).

    Raises:
        ValueError: If the header is not two integers, the file ends before
            ``vocab_size`` entries were read, an entry has no space after
            the word, or an entry does not hold exactly ``vector_size``
            components.
    """
    # ``with`` guarantees both files are closed even when a malformed entry
    # aborts the conversion half-way through.
    with open(input_path, "rb") as input_fd, \
            open(output_path, "wb") as output_fd:
        header = input_fd.readline()
        vocab_size, vector_size = map(int, header.split())
        output_fd.write(header)

        for entry_index in tqdm(range(vocab_size)):
            line = input_fd.readline()
            # Tolerate stray blank lines between entries.
            while line and not line.strip():
                line = input_fd.readline()
            if not line:
                # readline() returns b'' only at end of file; without this
                # check a header that overstates the vocabulary size would
                # never terminate.
                raise ValueError(
                    'unexpected end of file: header announces %d entries '
                    'but only %d were found' % (vocab_size, entry_index))

            # The word is everything up to the first space; the remainder of
            # the line is the textual vector.
            word, separator, components = line.partition(b' ')
            if not separator:
                raise ValueError(
                    'entry %d has no space between word and vector'
                    % (entry_index + 1))

            vector = np.fromstring(components, sep=' ', dtype='float32')
            if vector.size != vector_size:
                # A short or long vector would shift every following entry
                # in the binary file, so fail loudly instead.
                raise ValueError(
                    'entry %d has %d vector components, expected %d'
                    % (entry_index + 1, vector.size, vector_size))

            output_fd.write(word + b' ')
            # tobytes() replaces tostring(), which NumPy 2.0 removed.
            output_fd.write(vector.tobytes())
def main(argv):
    """Parse the command-line options and run the conversion.

    Recognised options are ``-i``/``--ifile`` (input path),
    ``-o``/``--ofile`` (output path) and ``-h`` (print usage).

    Args:
        argv: Command-line arguments without the program name, as in
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 when an option is not recognised or when
            the input or output path is missing; with the default success
            status after printing the usage for ``-h``.
    """
    # Single source for the usage text so every code path prints the same
    # (correct) script name.
    usage = 'vec2bin.py -i <inputfile> -o <outputfile>'
    inputfile = None
    outputfile = None

    try:
        opts, _ = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Both paths are mandatory; an empty string counts as missing.
    if not inputfile or not outputfile:
        print(usage)
        sys.exit(2)

    print('Converting %s to binary file format' % inputfile)
    vec2bin(inputfile, outputfile)
# Script entry point: pass everything after the program name to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)