-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_word2vec_model.py
45 lines (35 loc) · 1.33 KB
/
create_word2vec_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Run this script directly in the terminal to get the logging output, e.g.:
# python create_word2vec_model.py -i SOUtxtAllBigFile.txt -o my_model.model
# NOTE! Download the text file to analyze from
# http://www.christopherkullenberg.se/sorlet-fran-statens-offentliga-utredningar/
import regex
import logging
import gensim
from gensim import corpora, models
import argparse
# Command-line interface: -i/--input is mandatory, -o/--output is optional.
parser = argparse.ArgumentParser()
# No default for --input: argparse never consults the default of an option
# that is marked required, so one would only be dead configuration.
parser.add_argument("-i", "--input", help="""
Name for the file you want to analyze e.g. "SOUtxtAllBigFile.txt"
The file is supposed to be stored in the current folder if not specified
specificly.""", required=True)
# --output is optional so that its "./my_model.model" default can actually
# take effect; every command line that passed -o/--output before still works.
parser.add_argument("-o", "--output", help="""
Name for the model file you create e.g. "20talet.model"
The file is stored in the current folder if not specified
specificly.""", default="./my_model.model")
args = parser.parse_args()
outfile = args.output  # path the trained model is saved to
infile = args.input  # path of the text file that is read line by line
# Configure the root logger to emit timestamped messages at INFO level and
# above (presumably this is what surfaces gensim's training progress).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
class MySentences:
    """Iterable that streams token lists from a text file, one per line.

    The file is opened anew on every call to ``__iter__``, so the same
    object can be iterated more than once and the whole file is never held
    in memory at the same time.
    """

    def __init__(self, infile):
        """Store the path of the text file to read.

        infile: path that is passed to ``open()`` when iteration starts.
        """
        self.infile = infile

    def __iter__(self):
        """Yield one list of tokens for each line of the file.

        The characters ``: ! ? . ,`` are deleted from the line, and the
        remainder is split on whitespace. A blank line yields an empty list.
        """
        # Compile the pattern once per pass instead of looking it up per line.
        strip_punctuation = regex.compile(r"[\:\!\?\.\,]").sub
        # The with-block guarantees the handle is closed when the generator
        # is exhausted or closed; the bare open() used before leaked it.
        # NOTE(review): the file is decoded with the platform default
        # encoding -- confirm that matches the corpus.
        with open(self.infile) as handle:
            for line in handle:
                yield strip_punctuation("", line).split()
# Stream the corpus from disk instead of loading it into memory.
sentences = MySentences(infile)
# gensim 4.0 renamed Word2Vec's `size` keyword to `vector_size`; choose the
# spelling the installed release understands so the script runs on both.
_gensim_major = int(getattr(gensim, "__version__", "0").split(".")[0])
_dimension_kwarg = "vector_size" if _gensim_major >= 4 else "size"
# 200-dimensional vectors, context window of 5 tokens, words seen fewer than
# 30 times are ignored, 4 worker threads.
model = models.Word2Vec(
    sentences,
    window=5,
    min_count=30,
    workers=4,
    **{_dimension_kwarg: 200}
)
# Persist the trained model to the path given by -o/--output.
model.save(outfile)