-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtrain_w2v.py
61 lines (51 loc) · 2.09 KB
/
train_w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3
from c19 import word2vec_utilities, parameters, database_utilities, text_preprocessing
import os
"""
The trick is to generate a DB of articles without sentence embedding.
It will insert Null instead of the vector.
Then, all pre-processed sentences are used to train W2V.
"""
def main():
    """Build the article DB, pre-process all sentences, then train Word2Vec.

    Sentences are inserted with a NULL embedding on purpose: the corpus must
    exist before the W2V model that will later embed it can be trained.
    """
    db_config = parameters.Database(
        local_path="articles_database_v8_07042020.sqlite",
        kaggle_data_path="kaggle_data")
    params = parameters.Parameters(database=db_config)

    # Step 1: load every article (title, abstract and body) into the
    # 'article' table of the SQLite database.
    database_utilities.create_db_and_load_articles(
        db_path=params.database.local_path,
        kaggle_data_path=params.database.kaggle_data_path,
        first_launch=params.first_launch,
        only_newest=params.database.only_newest,
        only_covid=params.database.only_covid,
        enable_data_cleaner=params.database.enable_data_cleaner)

    # Step 2: pre-process every sentence. Passing embedding_model=None skips
    # vectorisation, so the vector column is left NULL for now.
    text_preprocessing.pre_process_and_vectorize_texts(
        embedding_model=None,
        db_path=params.database.local_path,
        first_launch=params.first_launch,
        stem_words=params.preprocessing.stem_words,
        remove_num=params.preprocessing.remove_numeric,
        batch_size=params.preprocessing.batch_size,
        max_body_sentences=params.preprocessing.max_body_sentences)

    # Hyper-parameters taken from https://www.aclweb.org/anthology/W16-2922.pdf
    # NOTE(review): "size"/"iter" are gensim < 4.0 names (4.x renamed them to
    # "vector_size"/"epochs") — presumably word2vec_utilities.W2V targets the
    # older API; confirm against the pinned gensim version before upgrading.
    training_params = {
        "sg": 1,            # skip-gram architecture
        "hs": 1,            # hierarchical softmax enabled
        "sample": 1e-5,     # sub-sampling threshold for frequent words
        "negative": 10,     # negative-sampling noise words
        "min_count": 20,    # ignore words rarer than this
        "size": 100,        # embedding dimensionality
        "window": 7,        # context window size
        "seed": 42,         # reproducibility
        "workers": os.cpu_count(),
        "iter": 10,         # training epochs
    }

    # Step 3: train and save the W2V model and the TF-IDF weights; the
    # processed corpus is also dumped to DF.parquet.
    trainer = word2vec_utilities.W2V(params.database.local_path,
                                     tfidf_path="TFIDF.pkl",
                                     w2v_path="W2V.bin",
                                     w2v_params=training_params,
                                     parquet_output_path="DF.parquet")
    trainer.train()
# Script entry point: run the full DB build / pre-process / train pipeline.
if __name__ == "__main__":
    main()