#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Filename: build_word2vec_model_v020.py
# Author: #cf
# Version 0.2.0 (2016-10-08)

"""
Builds word2vec models from text files using gensim.
https://rare-technologies.com/word2vec-tutorial/
"""

##################
# Imports
##################

import os
import re
import logging
from pathlib import Path

import gensim

print("gensim", gensim.__version__)

##################
# Parameters
##################

WorkDir = Path.cwd()
TextDir = WorkDir.joinpath("data", "frwiki")
ModelFile = WorkDir.joinpath("models", "frwiki.gensim")
Size = 500  # dimensionality of the word vectors
##################
# Functions
##################

def extract_sentences(TextPath):
    """
    Turns a collection of plain text files into a list of lists of word tokens.
    Loads the whole collection into memory, so use it only for small corpora.
    """
    print("--extract_sentences")
    Sentences = []
    for File in os.listdir(TextPath):
        with open(os.path.join(TextPath, File), "r", encoding="utf-8") as InFile:
            Text = InFile.read()
        Text = re.sub("\n", " ", Text)
        Text = re.sub("--", "", Text)
        Text = re.sub(r"\.\.\.", ".", Text)
        Text = Text.lower()
        SentencesOne = []
        for Sent in re.split("[.!?]", Text):
            Sent = re.split(r"\W", Sent)
            Sent = [Token for Token in Sent if Token]
            SentencesOne.append(Sent)
        Sentences.extend(SentencesOne)
    return Sentences
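
# A minimal usage sketch (illustrative; this helper is not called by the
# pipeline below, which uses the streaming approach instead):
#
#     Sentences = extract_sentences(TextDir)
#     print(Sentences[0])  # first sentence as a list of tokens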

def build_model(TextDir, Size, ModelFile):
    """
    Builds a word vector model of the text files given as input.
    This should be used for very large collections of text, as it is very memory-friendly.
    """
    print("--build_model")

    class MySentences(object):
        """Streams the corpus one sentence at a time instead of loading it all."""
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            for fname in os.listdir(self.dirname):
                for Para in open(os.path.join(self.dirname, fname), encoding="utf-8"):
                    # Skip the <doc ...> markup lines left over from the Wikipedia extractor.
                    if "<doc id" not in Para and "</doc>" not in Para:
                        for Sent in re.split("[.!?]", Para):
                            Sent = re.split(r"\W", Sent)
                            Sent = [Token.lower() for Token in Sent if Token]
                            Sent = [Token for Token in Sent if len(Token) > 2]
                            if len(Sent) > 1:
                                yield Sent

    Sentences = MySentences(TextDir)  # a memory-friendly iterator
    # Note: gensim >= 4.0 renamed the "size" parameter to "vector_size".
    Model = gensim.models.Word2Vec(Sentences, min_count=10, vector_size=Size, workers=2)
    # Make sure the output directory exists before saving.
    ModelFile.parent.mkdir(parents=True, exist_ok=True)
    Model.save(str(ModelFile))
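
# A minimal sketch (illustrative, not part of the original script) of how the
# saved model could be loaded and queried later; "roi" is just a placeholder
# query word:
#
#     Model = gensim.models.Word2Vec.load(str(ModelFile))
#     print(Model.wv.most_similar("roi", topn=10))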

################
# Main function
################

def main(TextDir, Size, ModelFile):
    print("Launched.")
    logging.basicConfig(filename="logging.txt", level=logging.INFO)
    build_model(TextDir, Size, ModelFile)
    print("Done.")

if __name__ == "__main__":
    main(TextDir, Size, ModelFile)
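
# To run this script (assuming plain-text input files under data/frwiki/
# relative to the working directory, as set in the Parameters above):
#
#     python3 build_word2vec_model_v020.py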