-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreduce.py
90 lines (83 loc) · 3.05 KB
/
reduce.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import codecs
import argparse
import sys
def reduce_dataset(fin, fout, max_allowed_word_len=30):
    """
    Copy fin to fout, leaving out every line that contains a word longer
    than the maximum allowed length.

    Each line is stripped of surrounding whitespace before it is checked and
    written, and words are the whitespace-separated tokens of the line.
    A line without any word (blank line) is kept.

    :param fin: file to read (UTF-8)
    :param fout: file to write (UTF-8)
    :param max_allowed_word_len: maximum word length to write the line
    :return: number of lines written to fout
    """
    kept = 0
    # Both files are managed by the same ``with`` so the input handle is
    # released even when opening the output file fails.
    with codecs.open(fin, mode='r', encoding='utf-8') as src, \
            codecs.open(fout, mode='w', encoding='utf-8') as dst:
        for raw in src:
            line = raw.strip()
            if any(len(word) > max_allowed_word_len for word in line.split()):
                continue
            kept += 1
            dst.write(line + '\n')
    # Single pre-formatted argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Number of lines  %d" % kept)
    return kept
def parse_morph_dis(fin, fout, START_TAG=u'<S>', END_TAG=u'</S>', SEP=u' '):
    """
    Parse MD dataset - save as sentences to fout,
    Save as the oracle format to fout.morph

    Every input line is split on SEP; the first field is the word and the
    second field is stored as its analysis. For each sentence one line is
    written to fout (the words joined by a space) and one line to
    fout + ".morph" (a "word:<word>+lemma:<second field>" item per word,
    joined by a space).

    Lines whose first field is START_TAG start a new sentence, lines whose
    first field is END_TAG flush the current sentence, and any other line
    whose first field starts with "<" is ignored. Blank lines are skipped.

    :param fin: file to read (UTF-8, a leading BOM is dropped)
    :param fout: file to write
    :param START_TAG: sentence start (default '<S>')
    :param END_TAG: sentence end (default '</S>')
    :param SEP: separator between the fields of a line (default ' ')
    :return: number of sentences written
    :raises IndexError: if a word line has no second field
    """
    # Initialised up front so that a word or END_TAG appearing before the
    # first START_TAG does not hit an unbound local.
    sent = []
    sent_morph = []
    sentcnt = 0
    with codecs.open(fin, mode='r', encoding='utf-8-sig') as src, \
            codecs.open(fout, mode='w', encoding='utf-8') as out_sent, \
            codecs.open(fout + ".morph", mode='w', encoding='utf-8') as out_morph:
        for line in src:
            line = line.strip()
            if not line:
                # nothing to parse on a blank line
                continue
            parts = line.split(SEP)
            wrd = parts[0]
            if wrd == START_TAG:
                sent = []
                sent_morph = []
            elif wrd == END_TAG:
                out_sent.write(" ".join(sent) + '\n')
                out_morph.write(" ".join(sent_morph) + '\n')
                sentcnt += 1
                # drop the flushed sentence so it can never be emitted twice
                sent = []
                sent_morph = []
            elif wrd.startswith(u'<'):
                # if it is another tag
                continue
            else:
                # The second field is only needed (and only required to
                # exist) for real word lines, not for tag lines.
                correct_tag = parts[1]
                sent.append(wrd)
                sent_morph.append("word:" + wrd + "+lemma:" + correct_tag)
    # Single pre-formatted argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Number of sentences  %d" % sentcnt)
    return sentcnt
def main(argv=None):
    """
    Command line entry point: parse the options and run the chosen operation.

    --op 1 runs reduce_dataset(fin, fout, maxlen); --op 2 (the default) runs
    parse_morph_dis(fin, fout). Any other value exits with the message
    "Wrong option number".

    :param argv: list of argument strings to parse; when None (the default)
        argparse reads them from sys.argv[1:]
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--fin', type=str, default='data/originalMD.txt',
                        help="Raw clean text")
    parser.add_argument('--fout', type=str, default='data/originalMD.morph',
                        help="Out file with reduced set of sentences")
    # --maxlen is only used by operation 1
    parser.add_argument('--maxlen', type=int, default=30,
                        help="Max length of a word in a sentence")
    parser.add_argument('--op', type=int, default=2,
                        help="1=reduce, 2=parse morph disamb")
    args = parser.parse_args(argv)

    if args.op == 1:
        reduce_dataset(args.fin, args.fout, args.maxlen)
    elif args.op == 2:
        parse_morph_dis(args.fin, args.fout)
    else:
        # a string argument makes sys.exit print it to stderr and exit with 1
        sys.exit("Wrong option number")
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()