forked from fishaudio/Bert-VITS2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
onnx_infer.py
71 lines (57 loc) · 2.27 KB
/
onnx_infer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from onnx_modules.V230_OnnxInference import OnnxInferenceSession
import soundfile as sf
import commons
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import numpy as np
# ONNX files for each sub-model of the BertVits2.2PT export, keyed by the
# component name handed to OnnxInferenceSession. The ".quant" suffix suggests
# these are the quantized variants (not verifiable from this file).
_MODEL_PATHS = {
    "enc": "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.quant.onnx",
    "emb_g": "onnx/BertVits2.2PT/BertVits2.2PT_emb.quant.onnx",
    "dp": "onnx/BertVits2.2PT/BertVits2.2PT_dp.quant.onnx",
    "sdp": "onnx/BertVits2.2PT/BertVits2.2PT_sdp.quant.onnx",
    "flow": "onnx/BertVits2.2PT/BertVits2.2PT_flow.quant.onnx",
    "dec": "onnx/BertVits2.2PT/BertVits2.2PT_dec.quant.onnx",
}

# Only the CPU execution provider is requested.
_PROVIDERS = ["CPUExecutionProvider"]

# Shared inference session used by the synthesis code below.
Session = OnnxInferenceSession(_MODEL_PATHS, Providers=_PROVIDERS)
def get_text(text, language_str, style_text=None, style_weight=0.5):
    """Turn raw text into the arrays consumed by the ONNX inference session.

    The text is cleaned into phones and tones, mapped to ID sequences, padded
    with blank (``0``) tokens via ``commons.intersperse``, and paired with the
    BERT features returned by ``get_bert``.

    Args:
        text: Raw text to synthesize.
        language_str: Language code. Only ``"EN"`` and ``"YUE"`` are accepted.
        style_text: Optional style reference text forwarded to ``get_bert``.
            An empty string is treated the same as ``None``.
        style_weight: Weight forwarded to ``get_bert`` with ``style_text``.

    Returns:
        A tuple ``(en_bert, yue_bert, phone, tone, language)``. ``phone``,
        ``tone`` and ``language`` are NumPy arrays of the blank-interspersed
        sequences. The BERT entry matching ``language_str`` holds the
        transposed ``get_bert`` output; the other entry is random normal
        noise of shape ``(len(phone), 1024)``.

    Raises:
        ValueError: If ``language_str`` is neither ``"EN"`` nor ``"YUE"``.
        AssertionError: If the last dimension of the BERT features does not
            equal the number of phones.
    """
    # Fail fast: reject unsupported languages before the (expensive) text
    # cleaning and BERT feature extraction run.
    if language_str not in ("EN", "YUE"):
        raise ValueError("language_str should be EN or YUE")

    style_text = None if style_text == "" else style_text

    # Implement the current version's get_text here.
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    # add blank
    phone = commons.intersperse(phone, 0)
    tone = commons.intersperse(tone, 0)
    language = commons.intersperse(language, 0)

    # Double every per-word phone count and add one to the first entry,
    # presumably so the counts still sum to the blank-interspersed phone
    # length (TODO confirm against commons.intersperse). A new list is built
    # so the list returned by clean_text is left untouched.
    word2ph = [count * 2 for count in word2ph]
    word2ph[0] += 1

    bert_ori = get_bert(
        norm_text, word2ph, language_str, "cpu", style_text, style_weight
    )
    del word2ph
    assert bert_ori.shape[-1] == len(phone), phone

    # The slot of the language that is not being spoken is filled with noise.
    noise = np.random.randn(1024, len(phone))
    if language_str == "EN":
        en_bert, yue_bert = bert_ori, noise
    else:  # "YUE" -- guaranteed by the validation at the top
        en_bert, yue_bert = noise, bert_ori

    assert yue_bert.shape[-1] == len(
        phone
    ), f"Bert seq len {yue_bert.shape[-1]} != {len(phone)}"

    phone = np.asarray(phone)
    tone = np.asarray(tone)
    language = np.asarray(language)
    # Transpose so the sequence axis comes first.
    en_bert = np.asarray(en_bert.T)
    yue_bert = np.asarray(yue_bert.T)
    return en_bert, yue_bert, phone, tone, language
# Demo run: synthesize a single "YUE" sentence and save the result as a WAV.
sid = np.array([0])  # presumably the speaker id -- confirm against the model
en_bert, yue_bert, x, tone, language = get_text(
    text="本身我就係一個言出必達嘅人",
    language_str="YUE",
)
print(x, tone, language)

audio = Session(x, tone, language, en_bert, yue_bert, sid)

# export audio: first item of the first output, written at 44100 Hz
sf.write(file="output.wav", data=audio[0][0], samplerate=44100)