-
Notifications
You must be signed in to change notification settings - Fork 31
/
Copy pathg2pc.py
248 lines (211 loc) · 7.64 KB
/
g2pc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
#-*- coding:utf8 -*-
'''
g2pC: A context-aware grapheme-to-phoneme module for Chinese
https://github.com/kyubyong/g2pC
kbpark.linguist@gmail.com
'''
import pickle
import re
import os
import pkuseg
from itertools import chain
def convert_hanzi_string_to_number(string):
return "/".join(str(ord(char)) for char in string)
def word2features(sent, i):
word = convert_hanzi_string_to_number(sent[i][0])
postag = sent[i][1]
features = {
'bias': 1.0,
'word':word,
'postag': postag,
}
if i == 0:
features['BOS'] = True
else:
if i > 0:
word1 = convert_hanzi_string_to_number(sent[i-1][0])
postag1 = sent[i-1][1]
features.update({
'-1:word': word1,
'-1:postag': postag1,
})
if i > 1:
word1 = convert_hanzi_string_to_number(sent[i-2][0])
postag1 = sent[i-2][1]
features.update({
'-2:word': word1,
'-2:postag': postag1,
})
if i > 2:
word1 = convert_hanzi_string_to_number(sent[i-3][0])
postag1 = sent[i-3][1]
features.update({
'-3:word': word1,
'-3:postag': postag1,
})
if i == len(sent)-1:
features['EOS'] = True
else:
if i < len(sent)-1:
word1 = convert_hanzi_string_to_number(sent[i+1][0])
postag1 = sent[i+1][1]
features.update({
'+1:word': word1,
'+1:postag': postag1,
})
if i < len(sent)-2:
word1 = convert_hanzi_string_to_number(sent[i+2][0])
postag1 = sent[i+2][1]
features.update({
'+2:word': word1,
'+2:postag': postag1,
})
if i < len(sent)-3:
word1 = convert_hanzi_string_to_number(sent[i+3][0])
postag1 = sent[i+3][1]
features.update({
'+3:word': word1,
'+3:postag': postag1,
})
return features
def sent2features(sent):
return [word2features(sent, i) for i in range(len(sent))]
def _tone_change(hanzis, pinyins):
'''https://en.wikipedia.org/wiki/Standard_Chinese_phonology#Tone_sandhi
'''
def tone33_to_23(pinyin):
return re.sub("3( [^ ]+?3)", r"2\1", pinyin)
# STEP 1. word-level
## Third tone change
pinyins = [tone33_to_23(pinyin) for pinyin in pinyins]
## when it comes at the end of a multi-syllable word
##(regardless of the first tone of the next word),
## 一 is pronounced with first tone.
_pinyins = []
for hanzi, pinyin in zip(hanzis, pinyins):
if len(hanzi)>1 and hanzi[0]=="一":
pinyin = re.sub("yi1$", "YI1", pinyin)
_pinyins.append(pinyin)
# STEP 2. phrase-level
## remove boundaries
hanzis = "".join(hanzis)
pinyins = " ".join(_pinyins)
## Third tone change
pinyins = tone33_to_23(pinyins)
pinyins = pinyins.split()
hanzis_prev = "^" + hanzis[:-1]
pinyins_prev = ["^"] + pinyins[:-1]
hanzis_next = hanzis[1:] + "$"
pinyins_next = pinyins[1:] + ["$"]
_pinyins = []
for h, h_prev, h_next, p, p_prev, p_next in zip(hanzis, hanzis_prev, hanzis_next, pinyins, pinyins_prev, pinyins_next):
if h == "一" and p == "yi1":
if h_prev in "第初图表卷":
p = "YI1"
elif h_prev == "头" and h_next == "回":
p = "YI1"
elif h_prev == "末" and h_next == "次":
p = "YI1"
elif h_next in "号楼更等级一二三四五陆七八九十廿卅卌百皕千万亿":
p = "YI1"
elif h_prev == h_next: # 4. A一A -> A yi5 A
p = "yi5"
elif p_next[-1] == "4": # 一 + A4 -> yi2 A4
p = "yi2"
elif p_next[-1] in "123": # 一 + A{1,2,3} -> yi4 A{1,2,3}
p = "yi4"
if h == "不" and p=="bu4":
if p_next[-1] == "4": # 不 + A4 -> bu2 A4
p = "bu2"
elif h_prev == h_next: # A不A -> A bu5 A
p = "bu5"
p = p.replace("YI", "yi")
_pinyins.append(p)
return _pinyins
def tone_change(results):
hanzis = [result[0] for result in results]
pinyins = [result[2] for result in results]
pinyins = _tone_change(hanzis, pinyins)
# align
rule_applied = []
for result in results:
n_syls = len(result[2].split())
_pinyin = " ".join(pinyins[:n_syls])
pinyins = pinyins[n_syls:]
result = (result[0], result[1], result[2], _pinyin, result[3], result[4])
rule_applied.append(result)
return rule_applied
class G2pC(object):
def __init__(self):
'''
self.cedict looks like:
{行: {pron: [hang2, xing2],
meaning: [/row/line, /to walk/to go],
trad: [行, 行]}
'''
self.seg = pkuseg.pkuseg(postag=True)
self.cedict = pickle.load(open(os.path.dirname(os.path.abspath(__file__)) + '/cedict.pkl', 'rb'))
self.crf = pickle.load(open(os.path.dirname(os.path.abspath(__file__)) + '/crf100.bin', 'rb'))
def __call__(self, string):
# fragment into sentences
sents = re.sub("([!?。])", r"\1[SEP]", string)
sents = sents.split("[SEP]")
_sents = []
for sent in sents:
if len(sent)==0: continue
# STEP 1
tokens = self.seg.cut(sent)
# STEP 2
analyzed = []
for word, pos in tokens:
if word in self.cedict:
features = self.cedict[word]
prons = features["pron"]
meanings = features["meaning"]
trads = features["trad"]
analyzed.append((word, pos, prons, meanings, trads))
else:
for char in word:
if char in self.cedict:
features = self.cedict[char]
prons = features["pron"]
meanings = features["meaning"]
trads = features["trad"]
else:
prons = [char]
meanings = [""]
trads = [char]
analyzed.append((char, pos, prons, meanings, trads))
_sents.append(analyzed)
# print("STEP1", tokens)
# print("STEP2", analyzed)
# STEP 3
features = [sent2features(_sent) for _sent in _sents]
preds = self.crf.predict(features)
# concatenate sentences
tokens = chain.from_iterable(_sents)
preds = chain.from_iterable(preds)
# determine pinyin
ret = []
for (word, pos, prons, meanings, trads), p in zip(tokens, preds):
# print(word, pos, prons, p)
p = p.replace("-", " ")
if p in prons:
pinyin = p
else:
pinyin = prons[0]
ind = prons.index(pinyin)
meaning = meanings[ind]
trad = trads[ind]
ret.append((word, pos, pinyin, meaning, trad))
# print("STEP3", ret)
# STEP 4
ret = tone_change(ret)
return ret
if __name__ == "__main__":
strings = ["有一次", "第一次", "十一二岁来到戏校", "同年十一月", "一九八二年英文版", "欧洲统一步伐", "吉林省一号工程", "一是选拔优秀干部"]
g2p = G2pC()
for string in strings:
results = g2p(string)
change = [each[3] for each in results]
print(string, "/", "|".join(change))