-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_on_corpus.py
123 lines (95 loc) · 4.11 KB
/
run_on_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import argparse
import re
import sys
import traceback
import pydoc
import common
import kanamatcher
def main(*argv):
    """Run the kana aligner over two parallel corpora and report the scores.

    The kanji corpus and the kana-only corpus are read in lockstep, one line
    from each per step. Every pair whose kanji line is not blank is stripped
    of all whitespace and passed to ``kanamatcher.match_kana``; the result and
    its score are printed. After the run, summary statistics are printed and
    every line with a score above zero is shown in a pager, worst first.

    Args:
        *argv: Command-line arguments without the program name. When no
            arguments are given, argparse falls back to ``sys.argv[1:]``.

    Side effects:
        Overwrites ``kanamatcher.NO_RUBY_PENALTY``,
        ``kanamatcher.KANA_MISMATCH_PENALTY`` and
        ``kanamatcher.MAX_NUM_ALIGNMENTS`` with the command-line values,
        prompts on stdin, prints to stdout, and writes the report to the
        ``--save-output`` path when one is given.
    """
    parser = argparse.ArgumentParser(
        description="Test aligner on corpus.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('kanji_corpus', help="Corpus including kanji")
    parser.add_argument('kana_corpus', help="Corpus only including kana")
    parser.add_argument('--all', action="store_true", default=False,
                        help="Run on whole corpus at once")
    parser.add_argument('--skip', type=int, default=0,
                        help="Number of lines at the start of the corpus to skip")
    parser.add_argument('--missing-ruby-penalty', type=int,
                        default=kanamatcher.NO_RUBY_PENALTY,
                        help="Penalty for a kanji missing ruby")
    parser.add_argument('--kana-mismatch-penalty', type=int,
                        default=kanamatcher.KANA_MISMATCH_PENALTY,
                        help="Penalty for a mismatch between the given kana and the generated ruby")
    parser.add_argument('--alignments-to-test', type=int,
                        default=kanamatcher.MAX_NUM_ALIGNMENTS,
                        help="Max number of alignments to test")
    parser.add_argument('--save-output', default=None,
                        help="Save statistics and scores to file")
    # Parse the arguments handed to main(); passing None keeps the
    # sys.argv fallback for a bare main() call.
    args = parser.parse_args(list(argv) if argv else None)

    # The matcher reads its tuning knobs from module-level names.
    kanamatcher.NO_RUBY_PENALTY = args.missing_ruby_penalty
    kanamatcher.KANA_MISMATCH_PENALTY = args.kana_mismatch_penalty
    kanamatcher.MAX_NUM_ALIGNMENTS = args.alignments_to_test

    print_all = args.all
    strip_whitespace = re.compile(r"\s")
    initial_skip = args.skip  # countdown of leading lines to pass over

    lines = 0          # non-blank kanji lines seen, including skipped ones
    errors = 0         # lines on which match_kana raised
    total_score = 0
    bad_lines = []     # (score, line number, kanji, kana, pretty output)

    try:
        with open(args.kanji_corpus, encoding="utf-8-sig") as kjf, \
                open(args.kana_corpus, encoding="utf-8-sig") as knf:
            skip_lines = 0  # lines still to auto-advance after a prompt
            for kanji, kana in zip(kjf, knf):
                # A blank kanji line drops the pair (the kana line read
                # alongside it is discarded too) and is not counted.
                if not kanji.strip():
                    continue
                lines += 1
                if initial_skip > 0:
                    initial_skip -= 1
                    continue

                print("{}.".format(lines))
                kanji = strip_whitespace.sub("", kanji)
                kana = strip_whitespace.sub("", kana)
                print(kanji)
                print(kana)

                try:
                    result, score = kanamatcher.match_kana(
                        kanji, kana, return_score=True)
                except Exception:
                    # Best effort: report the failure and keep going.
                    errors += 1
                    traceback.print_exc()
                    print()
                    continue

                total_score += score
                output = "\n".join(kanamatcher.pretty_print(result))
                print(output)
                print("Score:", score)
                if score > 0:
                    bad_lines.append((score, lines, kanji, kana, output))

                if print_all or skip_lines > 0:
                    if skip_lines > 0:
                        skip_lines -= 1
                    # NOTE(review): the source's indentation was lost; the
                    # blank separator is assumed to follow every
                    # auto-advanced line, matching the error path above.
                    print()
                else:
                    answer = input(
                        "Press enter or input the number of lines to skip: ")
                    try:
                        skip_lines = max(0, int(answer))
                    except ValueError:
                        # Plain enter or non-numeric input: just continue.
                        pass
    except KeyboardInterrupt:
        # Ctrl-C ends the run early but still produces the summary below.
        pass

    bad_lines.sort(reverse=True)

    # Guard the average: no line may have been processed at all.
    average = total_score / lines if lines else "n/a"
    std_output = ("================\n" +
                  "Errors during parsing: {}/{}\n".format(errors, lines) +
                  "Total score: {}\n".format(total_score) +
                  "Average score: {}\n".format(average) +
                  "================\n")
    pager_output = "".join(
        "{}.\n{}\n{}\n{}\nScore: {}\n\n".format(line, kanji, kana, output, score)
        for score, line, kanji, kana, output in bad_lines)

    if args.save_output is not None:
        try:
            # Explicit UTF-8 so Japanese text can be written regardless of
            # the platform's default encoding.
            with open(args.save_output, 'w', encoding="utf-8") as f:
                f.write(std_output)
                f.write(pager_output)
        except (OSError, UnicodeError):
            # Saving is best effort; the report is still shown on screen.
            traceback.print_exc()

    print(std_output)
    input("Press enter to view bad lines: ")
    pydoc.pager(pager_output)
# Script entry point: forward the command-line arguments (without the
# program name) to main() when the file is executed directly.
if __name__ == "__main__":
    main(*sys.argv[1:])