-
Notifications
You must be signed in to change notification settings - Fork 0
/
mandarin.py
109 lines (88 loc) · 4.6 KB
/
mandarin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
import pypinyin
import zhconv
from common import MakeDict
from pypinyin.style._tone_convert import tone3_to_tone
class MakeMandarin(MakeDict):
    """Mandarin dictionary builder layered on the shared ``MakeDict`` base.

    Phrase readings are read from ``data/cedict_ts.u8``, missing
    single-character readings are filled in with ``pypinyin``, and four text
    files are written under ``out_path``: ``phrases_dict.txt``,
    ``phrases_map.txt``, ``word.txt`` and ``trans_word.txt``.

    NOTE(review): ``phrase_pinyin_dict``, ``default_pinyin``,
    ``phrases_dict_out``, ``pos_dict``, ``map_keys``, ``transdict``,
    ``force_mono`` and ``out_path`` are not defined in this class; they are
    presumably set up by ``MakeDict`` -- confirm against ``common.py``.
    """

    def __init__(self, out_path, overwrite_pinyin, add_pinyin, transdict, user_dict, force_mono):
        """Forward every argument unchanged to ``MakeDict.__init__``."""
        super().__init__(out_path, overwrite_pinyin, add_pinyin, transdict, user_dict, force_mono)

    def load_dict(self):
        """Collect multi-character phrase readings from ``data/cedict_ts.u8``.

        For every line that matches the entry pattern, the second Han-script
        field is used as the phrase and the bracketed field as its reading.
        The reading is lower-cased, ``u:`` is rewritten to ``v`` and a bare
        ``r5`` syllable becomes ``er5``; the result is stored in
        ``self.phrase_pinyin_dict`` keyed by the phrase.
        """
        # Compiled once instead of being looked up for every line of the file.
        entry_re = re.compile(r"(.*[\u4e00-\u9fa5]) (.*[\u4e00-\u9fa5]) \[([\w :]+)] (\{([\w :]+)})?")
        # Phrases containing digits, Latin letters or these punctuation marks are ignored.
        reject_re = re.compile("[0-9A-Za-z·:,]")

        # Collect phrases
        with open("data/cedict_ts.u8", "r", encoding="utf-8") as f:
            for line in f:
                res = entry_re.search(line)
                if res is None:
                    continue
                phrase = res.group(2)
                # Single characters are not phrases; they are handled elsewhere.
                if len(phrase) <= 1 or reject_re.search(phrase):
                    continue
                # Group 3 is mandatory in the pattern, so it is never empty here.
                pinyin = res.group(3).lower().replace("u:", "v")
                syllables = ["er5" if s == "r5" else s for s in pinyin.split(" ")]
                self.phrase_pinyin_dict[phrase] = " ".join(syllables)

    def fill_unicode_pinyin(self):
        """Add a ``pypinyin`` reading for characters missing from ``default_pinyin``.

        Candidates are every code point in U+4E00..U+9FFF plus all values of
        ``self.transdict``. Each candidate is converted with
        ``zhconv.convert(..., "zh-cn")`` first; a reading is stored only if it
        contains nothing but ``a-z`` and digits.
        """
        plain_tone3 = re.compile(r"([^a-z\d])")
        candidates = [chr(cp) for cp in range(0x4E00, 0x9FFF + 1)] + list(self.transdict.values())
        for candidate in candidates:
            text = zhconv.convert(candidate, "zh-cn")
            if self.default_pinyin.get(text) is not None:
                continue
            pinyin = pypinyin.pinyin(text, style=pypinyin.TONE3)[0][0]
            # Skip results that still carry non-ASCII or punctuation characters.
            if plain_tone3.search(pinyin) is None:
                self.default_pinyin[text] = [pinyin]

    def make_dict(self):
        """Write the four output files under ``self.out_path``.

        * ``phrases_dict.txt`` -- ``phrase:py1,py2,...`` for phrases of 2-4
          characters whose syllable count equals the character count and which
          contain none of the ``force_mono`` strings.
        * ``phrases_map.txt`` -- ``char:lengths`` where ``lengths`` is the
          concatenation of the distinct lengths of the written phrases that
          contain ``char`` (only for characters found in ``self.map_keys``).
        * ``word.txt`` -- ``char:py1,py2,...`` for single-character keys of
          ``default_pinyin``.
        * ``trans_word.txt`` -- ``variant:char`` lines, from ``zhconv``'s
          ``zh-hant`` conversion and then from ``self.transdict``.
        """
        self.pos_dict.clear()

        with open(f"{self.out_path}/phrases_dict.txt", "w", encoding='utf-8') as f:
            for raw_phrase, raw_pinyin in self.phrases_dict_out.items():
                clip_pinyin = raw_pinyin.split(" ")
                phrase_size = len(raw_phrase)
                skip = any(text in raw_phrase for text in self.force_mono)
                if 1 < phrase_size == len(clip_pinyin) <= 4 and not skip:
                    tone_pinyin = ",".join([tone3_to_tone(x) for x in clip_pinyin])
                    f.write(f"{raw_phrase}:{tone_pinyin}\n")
                    # Lengths are recorded only for phrases that were written,
                    # so every recorded length is a single digit (2-4).
                    for char in raw_phrase:
                        if char in self.map_keys:
                            self.pos_dict.setdefault(char, []).append(phrase_size)

        with open(f"{self.out_path}/phrases_map.txt", "w", encoding='utf-8') as f:
            for k, v in self.pos_dict.items():
                if k not in self.force_mono:
                    # sorted() makes the emitted order independent of set iteration order.
                    map_pos = "".join(str(x) for x in sorted(set(v)))
                    f.write(f"{k}:{map_pos}\n")

        with open(f"{self.out_path}/word.txt", "w", encoding='utf-8') as f:
            for k, v in self.default_pinyin.items():
                if len(k) == 1:
                    v_list = ",".join([tone3_to_tone(item) for item in v if item])
                    f.write(f"{k}:{v_list}\n")

        with open(f"{self.out_path}/trans_word.txt", "w", encoding='utf-8') as f:
            for k in self.default_pinyin:
                t_k = zhconv.convert(k, "zh-hant")
                if t_k != k:
                    f.write(f"{t_k}:{k}\n")

            # Extra variant -> base mappings whose target has a known reading.
            for k, v in self.transdict.items():
                if k != v and k not in self.default_pinyin and v in self.default_pinyin:
                    f.write(f"{k}:{v}\n")
def _read_pairs(path, sep):
    """Yield ``(key, value)`` from each non-blank line of a two-column text file.

    Args:
        path: File to read (UTF-8).
        sep: Separator between the two fields on a line.

    Raises:
        ValueError: If a non-blank line does not split into exactly two fields.
    """
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            entry = line.strip('\n')
            if not entry.strip():
                # A stray blank line should not abort the whole build.
                continue
            fields = entry.split(sep)
            if len(fields) != 2:
                raise ValueError(
                    f"{path}:{line_no}: expected 2 fields separated by {sep!r}, got {len(fields)}"
                )
            yield fields[0], fields[1]


# Output directory for the generated dictionary files.
out_path = "dict/mandarin"

# Passed as the ``overwrite_pinyin`` argument: one fixed reading per character.
# NOTE(review): how MakeDict applies these is not visible here -- confirm.
overwrite_pinyin = {
    "儿": "er5", "了": "le5", "呢": "ne5", "曾": "ceng2", "重": "chong2", "地": "de5", "藏": "cang2", "都": "dou1",
    "还": "hai2", "弹": "tan2", "着": "zhe5", "的": "de5", "哦": "o4", "盛": "sheng4", "哟": "yo5", "喔": "o1",
    "湮": "yan1", "拓": "ta4", "系": "xi4", "谁": "shei2", "什": "shen2", "么": "me5", "扛": "kang2", "攒": "zan3",
    "嗯": "n4", "喳": "zha1", "哋": "di4", "嘅": "ge3", "粘": "zhan1", "恁": "nen4", "嗌": "ai4", "褪": "tui4",
}

# Passed as the ``add_pinyin`` argument.
extra_pinyin = {"濛": "meng2", "尅": "kei2"}

# Phrases containing any of these strings are left out of phrases_dict.txt.
force_mono = ["喳", "褪"]

# Character mapping table built from two files; only single-character keys
# are kept, and entries from the second file override the first.
chinese_transdict = {}
for path, sep in (("data/fanjian.txt", " "), ("data/fanjian2.txt", ",")):
    chinese_transdict.update((k, v) for k, v in _read_pairs(path, sep) if len(k) == 1)

# User-supplied ``key:value`` entries, kept regardless of key length.
user_dict = dict(_read_pairs("data/man_user.txt", ":"))

# NOTE(review): the instance is discarded, so the build presumably runs
# inside MakeDict.__init__ -- confirm against common.py.
MakeMandarin(out_path, overwrite_pinyin, extra_pinyin, chinese_transdict, user_dict, force_mono)