-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathrefinery.py
57 lines (43 loc) · 1.33 KB
/
refinery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import sys
import re
import codecs
def load_regexs(reference_filename):
    """Load substitution rules from a UTF-8, tab-separated reference file.

    Blank lines and lines whose first character is "#" are skipped. On every
    other line the first column is a regular expression and the second column
    is its replacement text; any further columns are ignored. The columns of
    each rule line are printed as they are read.

    Returns a list of (compiled_pattern, replacement) tuples in file order.
    Raises ValueError when a rule line has no replacement column and re.error
    when a pattern does not compile.
    """
    rules = []
    with codecs.open(reference_filename, "r", "utf-8") as reference:
        for line_number, line in enumerate(reference, 1):
            if not line.strip() or line[0] == u"#":
                continue
            tokens = line.rstrip(u"\r\n").split(u"\t")
            print(tokens)
            if len(tokens) < 2:
                raise ValueError(
                    "%s:%d: expected <regex><TAB><replacement>"
                    % (reference_filename, line_number))
            # Compile once here so that an invalid pattern fails immediately
            # instead of being reported again for every input line.
            rules.append((re.compile(tokens[0]), tokens[1]))
    return rules


def refine_line(line, regexs, column_indicator=None):
    """Apply the substitution rules to the selected columns of one line.

    line is stripped of surrounding whitespace and split on tabs. Column i is
    rewritten when column_indicator is None, or when column_indicator has a
    character at index i and that character is "1"; all other columns are
    copied unchanged. regexs is an iterable of (pattern, replacement) pairs
    applied in order; each pattern may be a string or a compiled pattern.

    Returns the tab-joined result, stripped and terminated by a newline, or
    None when line is blank.
    """
    stripped = line.strip()
    if not stripped:
        return None
    refined = []
    for index, column in enumerate(stripped.split("\t")):
        selected = column_indicator is None or (
            index < len(column_indicator) and column_indicator[index] == "1")
        if selected:
            for pattern, replacement in regexs:
                # No separator is appended here: columns are joined exactly
                # once below, so substitutions never see or add stray tabs.
                column = re.sub(pattern, replacement, column)
        refined.append(column)
    return "\t".join(refined).strip() + "\n"


def main(argv):
    """Convert a tab-separated file, rewriting columns with regex rules.

    argv layout: program name, input file, input encoding, output file,
    output encoding, then optionally a reference (rules) file and a column
    indicator string of "0"/"1" flags. Blank input lines are dropped. A line
    whose conversion or write raises is reported on stdout and skipped.
    """
    if len(argv) < 5:
        sys.exit(
            "Usage: %s INPUT INPUT_ENCODING OUTPUT OUTPUT_ENCODING "
            "[REFERENCE [COLUMN_INDICATOR]]" % argv[0])
    input_filename, input_encoding, output_filename, output_encoding = argv[1:5]
    reference_filename = argv[5] if len(argv) > 5 else None
    column_indicator = argv[6] if len(argv) > 6 else None

    regexs = []
    if reference_filename is not None:
        regexs = load_regexs(reference_filename)

    print("Start converting.")
    with codecs.open(input_filename, "r", input_encoding) as source:
        with codecs.open(output_filename, "w", output_encoding) as target:
            for line in source:
                # Best effort by design: one bad line (for example a character
                # the output encoding cannot represent) must not abort the
                # whole conversion.
                try:
                    refined = refine_line(line, regexs, column_indicator)
                    if refined is not None:
                        target.write(refined)
                except Exception as e:
                    print(e)


if __name__ == "__main__":
    main(sys.argv)