-
Notifications
You must be signed in to change notification settings - Fork 3
/
unsupervised_nlputils.py
41 lines (33 loc) · 1.53 KB
/
unsupervised_nlputils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import sys, math, argparse, re
from khaiii import KhaiiiApi
import mecab
def khaiii_tokenize(corpus_fname, output_fname):
    """Tokenize a corpus with Khaiii, writing one output line per input line.

    Each line of ``corpus_fname`` has its newline and surrounding whitespace
    removed and is passed to ``KhaiiiApi.analyze``. For every item the
    analyzer returns, the ``str()`` forms of its ``morphs`` are joined with
    single spaces; the per-item strings are joined with spaces as well,
    stripped, and written to ``output_fname`` followed by a newline.

    Args:
        corpus_fname: Path of the UTF-8 text file to read.
        output_fname: Path of the UTF-8 text file to create or overwrite.
    """
    analyzer = KhaiiiApi()
    with open(corpus_fname, 'r', encoding='utf-8') as src, \
            open(output_fname, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            text = raw_line.replace('\n', '').strip()
            # One space-joined string of morphs per analyzed item.
            pieces = [
                ' '.join(str(morph) for morph in word.morphs)
                for word in analyzer.analyze(text)
            ]
            dst.write(' '.join(pieces).strip() + '\n')
def mecab_tokenize(corpus_fname, output_fname):
    """Tokenize a corpus with MeCab, writing one output line per input line.

    Each line of ``corpus_fname`` has its newline and surrounding whitespace
    removed and is passed to ``MeCab.morphs``; the returned tokens are joined
    with single spaces and written to ``output_fname`` followed by a newline.

    Args:
        corpus_fname: Path of the UTF-8 text file to read.
        output_fname: Path of the UTF-8 text file to create or overwrite.
    """
    tagger = mecab.MeCab()
    with open(corpus_fname, 'r', encoding='utf-8') as src, \
            open(output_fname, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            morphemes = tagger.morphs(raw_line.replace('\n', '').strip())
            dst.write(' '.join(morphemes) + '\n')
if __name__ == '__main__':
    # Command-line entry point: run the tokenizer named by --preprocess_mode
    # on --input_path and write the result to --output_path.
    tokenizers = {
        'khaiii_tokenize': khaiii_tokenize,
        'mecab_tokenize': mecab_tokenize,
    }

    parser = argparse.ArgumentParser()
    # All three options are required and the mode is restricted to the known
    # tokenizers, so a missing or misspelled argument produces a usage error
    # instead of a silent no-op or a TypeError from open(None).
    parser.add_argument('--preprocess_mode', type=str, required=True,
                        choices=sorted(tokenizers), help='preprocess mode')
    parser.add_argument('--input_path', type=str, required=True,
                        help='Location of input files')
    parser.add_argument('--output_path', type=str, required=True,
                        help='Location of output files')
    args = parser.parse_args()

    tokenizers[args.preprocess_mode](args.input_path, args.output_path)