-
Notifications
You must be signed in to change notification settings - Fork 0
/
reading_splitter.py
108 lines (86 loc) · 3.35 KB
/
reading_splitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import sqlite3
import Levenshtein
from needleman_wunsch import needleman_wunsch as align
import common
# Shared module-level connection to the KANJIDIC readings database.
# NOTE(review): opened at import time and never closed explicitly —
# presumably relies on process exit for cleanup; confirm this is intended.
conn = sqlite3.connect("kanjidic.db")
def process_reading(reading):
    """Normalize a dictionary reading and expand it into the set of
    phonetic variants it can take inside a compound.

    The reading is converted to hiragana (via the project's ``common``
    helper), okurigana (the part after ".") and affix markers ("-") are
    stripped, and then sound-change variants are generated until a fixed
    point is reached.

    Returns a set of hiragana strings (empty if the reading reduces to
    an empty string after stripping).
    """
    hiragana = common.to_hiragana(reading)
    hiragana = hiragana.split(".")[0].strip("-")
    if not hiragana:
        # A reading consisting only of okurigana/markers reduces to "";
        # without this guard the generator below would raise IndexError
        # on hiragana[-1] / hiragana[0].
        return set()
    generated_readings = {hiragana}

    def generate_extra_readings(hiragana):
        # handle sokuon: a final き/く/ち/つ may contract to small っ
        if hiragana[-1] in "きくちつ":
            yield hiragana[:-1] + "っ"
        # handle rendaku: the voiced kana is the next Unicode code point
        # (e.g. か U+304B -> が U+304C)
        if hiragana[0] in "かきくけこさしすせそたちつてとはひふへほ":
            yield chr(ord(hiragana[0]) + 1) + hiragana[1:]
        # handle handakuten for the ha row (は -> ぱ, two code points up)
        if hiragana[0] in "はひふへほ":
            yield chr(ord(hiragana[0]) + 2) + hiragana[1:]
        # handle homophones for ぢ/づ, which are normally spelled じ/ず
        if hiragana[0] in "ちつ":
            yield {"ち": "じ", "つ": "ず"}[hiragana[0]] + hiragana[1:]

    # Iterate to a fixed point so that variants of variants (e.g. rendaku
    # plus sokuon) are also included.
    old = set()
    while generated_readings != old:
        old = generated_readings
        generated_readings = generated_readings | {
            new_reading
            for old_reading in generated_readings
            for new_reading in generate_extra_readings(old_reading)}
    return generated_readings
def get_readings(kanji):
    """Return the set of processed readings for *kanji* from the database.

    Every reading row is expanded through ``process_reading``; the
    wildcard "?" is always included so each kanji has at least one
    candidate during alignment.
    """
    c = conn.cursor()
    try:
        rows = c.execute(
            "SELECT reading, type from readings where kanji=?", (kanji,))
        readings = {processed_reading
                    for reading, reading_type in rows
                    for processed_reading in process_reading(reading)}
    finally:
        # Release the cursor even if the query or reading expansion fails;
        # the original leaked it on exception.
        c.close()
    readings.add("?")
    return readings
def split_reading(kanji, kana, max_distance=1, return_score=False):
    """Split the kana reading of a word across its kanji characters.

    Enumerates every combination of per-character candidate readings,
    scores each against *kana* by Levenshtein distance, and returns the
    best split as a list of (character, reading) pairs.  If no exact
    match exists, an approximate alignment within *max_distance* is
    attempted; as a last resort the whole word maps to the whole kana.

    When *return_score* is true, returns ``(split, min_distance)``.
    """
    def generate_candidates(chars, cur, i):
        # Depth-first enumeration of reading combinations; the iteration
        # mark "々" reuses the previous character's readings.
        if i >= len(chars):
            yield cur
            return
        source = chars[i - 1] if chars[i] == "々" else chars[i]
        for reading in get_readings(source):
            yield from generate_candidates(
                chars, cur + [(chars[i], reading)], i + 1)

    scores = []
    for candidate in generate_candidates(kanji, [], 0):
        # ''.join over pairs also handles the empty-kanji candidate,
        # where the original zip(*reading) unpack raised ValueError.
        joined = ''.join(part for _, part in candidate)
        scores.append((candidate, Levenshtein.distance(joined, kana)))
    scores.sort(key=lambda entry: entry[1])
    # Sorted ascending, so the first entry already holds the minimum —
    # no need to recompute min() and search for its index.
    result, min_score = scores[0]
    if min_score == 0:
        return (result, min_score) if return_score else result
    res = approximate_split(scores, kana, max_distance)
    if res:
        return (res, min_score) if return_score else res
    # Fallback: attribute the entire kana string to the entire word.
    res = [(kanji, kana)]
    return (res, min_score) if return_score else res
def approximate_split(readings, kana, max_distance):
    """Try to split *kana* over the best near-miss candidate readings.

    *readings* is a list of (candidate, distance) pairs sorted by
    distance.  Each candidate within *max_distance* is aligned against
    *kana* with Needleman-Wunsch, using a space as the segment
    separator; if the alignment yields one kana segment per character,
    that pairing is returned.  Falls through (returning None) when no
    candidate qualifies.
    """
    def char_score(x, y):
        # Reward exact matches, heavily penalize consuming the separator.
        if x == y:
            return 1
        if x == ' ' or y == ' ':
            return -10
        return -1

    for candidate, dist in readings:
        if dist > max_distance:
            # Input is sorted by distance; later entries cannot qualify.
            break
        separated = ' '.join([segment for _, segment in candidate])
        aligned_kana, _ = next(
            align(kana, separated, d=-1, fill=" ", s=char_score))
        segments = filter(None, aligned_kana.split(' '))
        paired = list(zip([char for char, _ in candidate], segments))
        if len(paired) == len(candidate):
            return paired