-
Notifications
You must be signed in to change notification settings - Fork 240
/
04_fasttext_train_vectors.py
135 lines (124 loc) · 6 KB
/
04_fasttext_train_vectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python
from typing import Optional
from pathlib import Path
from wasabi import msg
import fasttext
from errno import EPIPE
import typer
# python 04_fasttext_train_vectors.py /path/to/output/director/ -in /path/to/input/directory
def main(
# fmt: off
out_dir: str = typer.Argument(..., help="Path to output directory"),
in_dir: Optional[str] = typer.Argument(None, help="Path to directory with preprocessed .s2v file(s)"),
n_threads: int = typer.Option(10, "--n-threads", "-t", help="Number of threads"),
min_count: int = typer.Option(50, "--min-count", "-c", help="Minimum count for inclusion in vocab"),
vector_size: int = typer.Option(300, "--vector-size", "-s", help="Dimension of word vector representations"),
epoch: int = typer.Option(5, "--epoch", "-e", help="Number of times the fastText model will loop over your data"),
save_fasttext_model: bool = typer.Option(False, "--save-fasttext-model", "-sv", help="Save fastText model to output directory as a binary file to avoid retraining"),
fasttext_filepath: Optional[str] = typer.Option(None, "--fasttext-filepath", "-ft", help="Path to saved fastText model .bin file"),
verbose: int = typer.Option(2, "--verbose", "-v", help="Set verbosity: 0, 1, or 2"),
# fmt: on
):
"""
Step 4: Train the vectors
Expects a directory of preprocessed .s2v input files, will concatenate them
(using a temporary file on disk) and will use fastText to train a word2vec
model. See here for installation instructions:
https://github.com/facebookresearch/fastText
Note that this script will call into fastText and expects you to pass in the
built fasttext binary. The command will also be printed if you want to run
it separately.
"""
output_path = Path(out_dir)
if not output_path.exists():
output_path.mkdir(parents=True)
msg.good(f"Created output directory {out_dir}")
if fasttext_filepath:
msg.info("Loading fastText model vectors from .bin file")
if in_dir:
msg.warn(
f"Warning: Providing a fastText filepath overrides fastText vector training"
)
fasttext_filepath = Path(fasttext_filepath)
if (
not fasttext_filepath.exists()
or not fasttext_filepath.is_file()
or not (fasttext_filepath.suffix == ".bin")
):
msg.fail(
"Error: fasttext_filepath expects a fastText model .bin file", exits=1
)
fasttext_model = fasttext.load_model(str(fasttext_filepath))
msg.good("Successfully loaded fastText model")
elif in_dir:
msg.info("Training fastText model vectors")
input_path = Path(in_dir)
# Check to see if fasttext_filepath exists
if not input_path.exists() or not input_path.is_dir():
msg.fail("Not a valid input directory", in_dir, exits=1)
tmp_path = input_path / "s2v_input.tmp"
input_files = [p for p in input_path.iterdir() if p.suffix == ".s2v"]
if not input_files:
msg.fail("Input directory contains no .s2v files", in_dir, exits=1)
# fastText expects only one input file and only reads from disk and not
# stdin, so we need to create a temporary file that concatenates the inputs
with tmp_path.open("a", encoding="utf8") as tmp_file:
for input_file in input_files:
with input_file.open("r", encoding="utf8") as f:
tmp_file.write(f.read())
msg.info("Created temporary merged input file", tmp_path)
fasttext_model = fasttext.train_unsupervised(
str(tmp_path),
thread=n_threads,
epoch=epoch,
dim=vector_size,
minn=0,
maxn=0,
minCount=min_count,
verbose=verbose,
)
msg.good("Successfully trained fastText model vectors")
tmp_path.unlink()
msg.good("Deleted temporary input file", tmp_path)
output_file = output_path / f"vectors_w2v_{vector_size}dim.bin"
if save_fasttext_model:
fasttext_model.save_model(str(output_file))
if not output_file.exists() or not output_file.is_file():
msg.fail("Failed to save fastText model to disk", output_file, exits=1)
msg.good("Successfully saved fastText model to disk", output_file)
else:
fasttext_model = None
msg.fail("Must provide an input directory or fastText binary filepath", exits=1)
msg.info("Creating vocabulary file")
vocab_file = output_path / "vocab.txt"
words, freqs = fasttext_model.get_words(include_freq=True)
with vocab_file.open("w", encoding="utf8") as f:
for i in range(len(words)):
f.write(words[i] + " " + str(freqs[i]) + " word\n")
if not vocab_file.exists() or not vocab_file.is_file():
msg.fail("Failed to create vocabulary", vocab_file, exits=1)
msg.good("Successfully created vocabulary file", vocab_file)
msg.info("Creating vectors file")
vectors_file = output_path / "vectors.txt"
# Adapted from https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/bin_to_vec.py#L31
with vectors_file.open("w", encoding="utf-8") as file_out:
# the first line must contain the number of total words and vector dimension
file_out.write(
str(len(words)) + " " + str(fasttext_model.get_dimension()) + "\n"
)
# line by line, append vector to vectors file
for w in words:
v = fasttext_model.get_word_vector(w)
vstr = ""
for vi in v:
vstr += " " + str(vi)
try:
file_out.write(w + vstr + "\n")
except IOError as e:
if e.errno == EPIPE:
pass
if not vectors_file.exists() or not vectors_file.is_file():
msg.fail("Failed to create vectors file", vectors_file, exits=1)
msg.good("Successfully created vectors file", vectors_file)
if __name__ == "__main__":
typer.run(main)