simple_tokenizer.py (forked from hlt-bme-hu/definition-normalizer)
#!/usr/bin/env python2.7
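# Tokenize tab-separated definition entries read from stdin: the first field
# (the headword) is passed through unchanged, the remaining fields are split
# into tokens and only purely alphabetic tokens are kept. Output is UTF-8.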
from sys import stdin, stdout
from argparse import ArgumentParser
import re
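
# A token is kept only if it consists solely of ASCII letters, hyphens,
# apostrophes (straight or curly, U+2019) and 'e' with acute accent (U+00E9).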
word_re = re.compile(ur'^[A-Za-z\-\'\u2019\u00e9]+$', re.UNICODE)


def setup_parser():
    parser = ArgumentParser()
    parser.add_argument('-s', '--separator', dest='sep', type=str,
                        default=' ', help='token separator on the right-hand side')
    parser.add_argument('-l', '--lower', dest='lower', action='store_true', default=False,
                        help='lowercase all words')
    parser.add_argument('-e', '--encoding', dest='encoding', type=str, default='utf8',
                        help='input encoding; output is always UTF-8')
    return parser


def tokenize_line(line, cfg):
    # The first tab-separated field (the headword) is kept as-is; the rest of
    # the line is split into tokens and filtered through normalize_word().
    fd = line.split('\t')
    left = normalize_left(fd[0], cfg)
    right = '\t'.join(fd[1:])
    words = remove_spec(right, cfg).split(cfg.sep)
    words_norm = list()
    for w in words:
        w_ = normalize_word(w, cfg)
        if w_:
            words_norm.append(w_)
    return [left] + words_norm


def normalize_left(left, cfg):
    # The headword is currently passed through unchanged.
    return left


def remove_spec(line, cfg):
    # Drop full stops and colons; optionally lowercase the whole definition.
    l = line.replace('.', '')
    l = l.replace(':', '')
    if cfg.lower:
        l = l.lower()
    return l


def normalize_word(word, cfg):
    # Discard tokens containing anything but the characters allowed by word_re.
    if not word_re.match(word):
        return None
    return word


def main():
    parser = setup_parser()
    cfg = parser.parse_args()
    for l in stdin:
        # Strip the trailing newline so it is not glued to the last token.
        line = l.decode(cfg.encoding, 'ignore').rstrip('\n')
        tok = tokenize_line(line, cfg)
        stdout.write('\t'.join(tok).encode('utf8') + '\n')


if __name__ == '__main__':
    main()
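
# Example invocation (a sketch; 'definitions.tsv' is a hypothetical file whose
# first column is a headword and whose remaining columns hold the definition):
#   cat definitions.tsv | ./simple_tokenizer.py --lower > tokenized.tsv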