forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
europarl2vrt.py
executable file
·62 lines (47 loc) · 1.66 KB
/
europarl2vrt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import codecs
import re
from optparse import OptionParser
def getopts():
    """Parse the script's command-line arguments from ``sys.argv``.

    Both options are boolean flags that take no value.  They are declared
    with ``action='store_true'`` so that a following file name is not
    consumed as the option's value (optparse's default action, ``store``,
    expects a value after the option).

    Returns:
        The ``(opts, args)`` tuple produced by ``OptionParser.parse_args()``:
        ``opts.tokenize`` and ``opts.add_links`` are booleans (``False``
        unless the flag is given), and ``args`` is the list of remaining
        positional arguments.
    """
    optparser = OptionParser()
    optparser.add_option(
        '--tokenize', action='store_true', default=False,
        help='separate punctuation marks from words before splitting'
             ' the text into tokens')
    optparser.add_option(
        '--add-links', '--add-link-elements', action='store_true',
        dest='add_links', default=False,
        help='wrap the tokens of each sentence in a <link> element')
    return optparser.parse_args()
def process_input(args, opts, input_encoding):
    """Run ``process_file`` over every input source.

    Args:
        args: List of input file names; when it is empty, ``sys.stdin``
            is processed instead.
        opts: Parsed options object, passed through to ``process_file``.
        input_encoding: Encoding used to decode the named files.
    """
    # No file names given: fall back to standard input.
    if not args:
        process_file(sys.stdin, opts)
        return
    for path in args:
        with codecs.open(path, 'r', encoding=input_encoding) as stream:
            process_file(stream, opts)
def process_file(f, opts):
    """Write each line of *f* to ``sys.stdout`` as a ``<sentence>`` element.

    Every input line becomes one sentence whose ``id`` is the 1-based line
    number within *f* (numbering restarts on each call).  The sentence
    content is whatever ``tokenize`` returns for the line.  When
    ``opts.add_links`` is true, the content is additionally wrapped in a
    ``<link>`` element carrying the same id.

    Args:
        f: Iterable of text lines (an open file or ``sys.stdin``).
        opts: Parsed options; ``add_links`` is read here and the object is
            passed on to ``tokenize``.
    """
    for sentnr, line in enumerate(f, 1):
        sys.stdout.write('<sentence id="' + str(sentnr) + '">\n')
        if opts.add_links:
            sys.stdout.write('<link id="' + str(sentnr) + '">\n')
        # Strip only the line terminator.  Slicing off the last character
        # unconditionally would lose real text on a final line that has no
        # trailing newline, and would leave the '\r' of a CRLF ending in
        # place, which keeps end-of-line punctuation from being split off.
        sys.stdout.write(tokenize(line.rstrip('\r\n'), opts))
        if opts.add_links:
            sys.stdout.write('</link>\n')
        sys.stdout.write('</sentence>\n')
def tokenize(text, opts):
    """Return *text* with one whitespace-separated token per line.

    When ``opts.tokenize`` is true, a fixed sequence of regular-expression
    substitutions first inserts spaces around punctuation so that the
    final whitespace split yields punctuation marks as separate tokens.

    Args:
        text: The sentence text; a false value (``None`` or ``''``) is
            treated as the empty string.
        opts: Object whose ``tokenize`` attribute enables the punctuation
            rules.

    Returns:
        The tokens joined by newlines, followed by a final newline (just
        ``'\\n'`` when there are no tokens).
    """
    if not text:
        text = ''
    if opts.tokenize:
        # The rules are order-dependent: each one sees the output of the
        # previous substitution.
        spacing_rules = (
            # Sentence punctuation directly followed by a double quote.
            (r'([.?!,:])(")', r'\1 \2'),
            # An ellipsis directly followed by other punctuation.
            (r'(\.\.\.)([,:;?!")])', r' \1 \2'),
            # Punctuation (or an ellipsis) before whitespace or end of text.
            (r'([.,:;?!")]|\.\.\.)([ \n]|\Z)', r' \1\2'),
            # A double quote or opening parenthesis after whitespace or at
            # the start of the text.
            (r'([ \n]|\A)(["(])', r'\1\2 '),
        )
        for pattern, replacement in spacing_rules:
            text = re.sub(pattern, replacement, text)
    tokens = text.split()
    return '\n'.join(tokens) + '\n'
def main():
    """Script entry point: set up UTF-8 standard streams and process input.

    Replaces ``sys.stdin`` and ``sys.stdout`` with UTF-8 codec wrappers,
    parses the command line with ``getopts`` and hands the result to
    ``process_input``.
    """
    input_encoding = 'utf-8'
    output_encoding = 'utf-8'
    # The codec reader/writer must sit on top of a byte stream.  On
    # Python 3 sys.stdin/sys.stdout are text streams whose byte stream is
    # exposed as ``.buffer``; where that attribute does not exist
    # (Python 2 file objects) the stream itself is wrapped, as before.
    sys.stdin = codecs.getreader(input_encoding)(
        getattr(sys.stdin, 'buffer', sys.stdin))
    sys.stdout = codecs.getwriter(output_encoding)(
        getattr(sys.stdout, 'buffer', sys.stdout))
    (opts, args) = getopts()
    process_input(args, opts, input_encoding)
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()