-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext2phoneme.py
95 lines (74 loc) · 2.47 KB
/
text2phoneme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
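"""Convert Japanese sentences into phoneme sequences.

GiNZA or JUMAN++ V2 is used to insert spaces at phrase (bunsetsu)
boundaries, and the spaced text is then passed to pyopenjtalk.
Splitting the text into phrases beforehand reduces morpheme
estimation errors.

Requirements
-----
- PyKNP
- Ginza
- Juman++ V2
- pyopenjtalk (https://github.com/korguchi/pyopenjtalk)
"""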
from pyknp import Juman
import spacy
import pyopenjtalk
# Loaded spaCy pipelines keyed by model name, so the model is read from
# disk only once per process instead of on every call.
_GINZA_PIPELINES = {}


def _load_ginza_pipeline(model_name="ja_ginza_electra"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    pipeline = _GINZA_PIPELINES.get(model_name)
    if pipeline is None:
        pipeline = spacy.load(model_name)
        _GINZA_PIPELINES[model_name] = pipeline
    return pipeline


def insert_spaces_ginza(text):
    """Insert spaces into ``text`` based on GiNZA part-of-speech tags.

    A space is inserted between two consecutive ``NOUN`` tokens, and
    before a non-noun token once an ``ADP`` or ``AUX`` token has been
    seen since the last inserted space. No space is ever inserted
    directly in front of a punctuation mark.

    Args:
        text: Text to analyze with the ``ja_ginza_electra`` pipeline.

    Returns:
        The token texts concatenated back together, with the inserted
        spaces.
    """
    doc = _load_ginza_pipeline()(text)
    output = []
    prev_noun = False
    # Text of the most recent ADP/AUX token; non-empty means "a space is
    # still owed before the next non-noun token".
    prev_chunk = ""
    punctuations = ["、", "。", "!", "?", ";", ":", "・"]
    for token in doc:
        morpheme = token.text
        pos = token.pos_
        if pos == "NOUN":
            # Separate runs of nouns from each other.
            if prev_noun and morpheme not in punctuations:
                output.append(" ")
            output.append(morpheme)
            prev_noun = True
        else:
            # NOTE(review): the reset below is assumed to belong to the
            # space-insertion branch - confirm against the upstream file.
            if prev_chunk != "" and morpheme not in punctuations:
                output.append(" ")
                prev_chunk = ""
            output.append(morpheme)
            prev_noun = False
            if pos == "ADP" or pos == "AUX":
                prev_chunk = morpheme
    return "".join(output)
def insert_spaces_juman(text):
    """Insert spaces into ``text`` based on JUMAN++ part-of-speech tags.

    A space is inserted between two consecutive noun (名詞) morphemes,
    and before a non-noun morpheme once a particle (助詞) or auxiliary
    verb (助動詞) has been seen since the last inserted space. No space
    is ever inserted directly in front of a punctuation mark.

    Args:
        text: Text to analyze with the ``jumanpp`` command.

    Returns:
        The morpheme surface forms concatenated back together, with the
        inserted spaces.
    """
    analyzer = Juman("jumanpp", multithreading=True)
    parsed = analyzer.analysis(text)

    marks = ["、", "。", "!", "?", ";", ":", "・"]
    pieces = []
    after_noun = False
    # Surface form of the latest particle/auxiliary still awaiting a space.
    pending = ""

    for mrph in parsed.mrph_list():
        surface = mrph.midasi
        hinsi = mrph.hinsi
        may_split = surface not in marks

        if hinsi == "名詞":
            # Separate runs of nouns from each other.
            if after_noun and may_split:
                pieces.append(" ")
            pieces.append(surface)
            after_noun = True
            continue

        if pending != "" and may_split:
            pieces.append(" ")
            pending = ""
        pieces.append(surface)
        after_noun = False
        if hinsi in ("助詞", "助動詞"):
            pending = surface

    return "".join(pieces)
def insert_spaces(text, analyzer='ginza'):
    """Insert spaces into ``text`` using the selected analyzer.

    Args:
        text: Text to segment.
        analyzer: ``'ginza'`` selects ``insert_spaces_ginza``; any other
            value selects ``insert_spaces_juman``.

    Returns:
        The text with spaces inserted by the chosen analyzer.
    """
    if analyzer == 'ginza':
        return insert_spaces_ginza(text)
    # The JUMAN++ segmenter must be called with the text; returning the
    # bare function object would hand callers a function, not a string.
    return insert_spaces_juman(text)
def text2phoneme(text):
    """Return the ``pyopenjtalk.g2p`` result for the space-segmented ``text``.

    The text is first passed through ``insert_spaces`` with its default
    analyzer, and the spaced text is then given to ``pyopenjtalk.g2p``.
    """
    return pyopenjtalk.g2p(insert_spaces(text))
if __name__ == '__main__':
    # Demo: convert a sample sentence and print it wrapped in 'sil' markers.
    sample = "ゲグァンはこのところ他者を見下すし、ちょっと脅かすか?"
    converted = text2phoneme(sample)
    print(" ".join(("sil", converted, "sil")))