Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[demos] use new engine api for speech_web #2080

Merged
merged 2 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 8 additions & 55 deletions demos/speech_web/README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ PaddleSpeechDemo是一个以PaddleSpeech的语音交互功能为主体开发的D
# 安装环境
cd speech_server
pip install -r requirements.txt

# 下载 ie 模型,针对地点进行微调,效果更好,不下载的话会使用其它版本,效果没有这个好
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

uie?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

information_extraction,这个模型是针对地点微调过的,更好一些

cd source
mkdir model
cd model
wget https://bj.bcebos.com/paddlenlp/applications/speech-cmd-analysis/finetune/model_state.pdparams
```


Expand Down Expand Up @@ -61,59 +67,6 @@ yarn dev --port 8011
```

默认配置下,前端中配置的后台地址信息是localhost,确保后端服务器和打开页面的浏览器在同一台机器上,不在一台机器的配置方式见下方的FAQ:【后端如果部署在其它机器或者别的端口如何修改】

## Docker启动

### 后端docker
后端docker使用[paddlepaddle官方docker](https://www.paddlepaddle.org.cn),这里演示CPU版本
```
# 拉取PaddleSpeech项目
cd PaddleSpeechServer
git clone https://github.com/PaddlePaddle/PaddleSpeech.git

# 拉取镜像
docker pull registry.baidubce.com/paddlepaddle/paddle:2.3.0

# 启动容器
docker run --name paddle -it -p 8010:8010 -v $PWD:/paddle registry.baidubce.com/paddlepaddle/paddle:2.3.0 /bin/bash

# 进入容器
cd /paddle

# 安装依赖
pip install -r requirements.txt

# 启动服务
python main.py --port 8010

```

### 前端docker

前端docker直接使用[node官方的docker](https://hub.docker.com/_/node)即可

```shell
docker pull node
```

镜像中安装依赖

```shell
cd PaddleSpeechWebClient
# 映射外部8011端口
docker run -it -p 8011:8011 -v $PWD:/paddle node:latest /bin/bash
# 进入容器中
cd /paddle
# 安装依赖
yarn install
# 启动前端
yarn dev --port 8011
```





## FAQ

#### Q: 如何安装node.js
Expand All @@ -126,7 +79,7 @@ A:后端的配置地址有分散在两个文件中

修改第一个文件`PaddleSpeechWebClient/vite.config.js`

```json
```
server: {
host: "0.0.0.0",
proxy: {
Expand All @@ -141,7 +94,7 @@ server: {

修改第二个文件`PaddleSpeechWebClient/src/api/API.js`(Websocket代理配置失败,所以需要在这个文件中修改)

```javascript
```
// websocket (这里改成后端所在的接口)
CHAT_SOCKET_RECORD: 'ws://localhost:8010/ws/asr/offlineStream', // ChatBot websocket 接口
ASR_SOCKET_RECORD: 'ws://localhost:8010/ws/asr/onlineStream', // Stream ASR 接口
Expand Down
2 changes: 1 addition & 1 deletion demos/speech_web/speech_server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from src.WebsocketManeger import ConnectionManager
from src.SpeechBase.vpr import VPR

from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler
from paddlespeech.server.engine.asr.online.python.asr_engine import PaddleASRConnectionHanddler
from paddlespeech.server.utils.audio_process import float2pcm


Expand Down
23 changes: 0 additions & 23 deletions demos/speech_web/speech_server/src/AudioManeger.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,29 +145,6 @@ def stop(self):

def resume(self):
self.is_pause = False


if __name__ == '__main__':
from robot import Robot

chatbot = Robot()
chatbot.init()
audio_manger = AudioMannger(chatbot)

file_list = [
"source/20220418145230qbenc.pcm",
]

for file in file_list:
with open(file, "rb") as f:
pcm_bin = f.read()
print(len(pcm_bin))
asr_ = audio_manger.stream_asr(pcm_bin=pcm_bin)
print(asr_)

print(audio_manger.end())

print(chatbot.speech2text("source/20220418145230zrxia.wav"))



29 changes: 2 additions & 27 deletions demos/speech_web/speech_server/src/SpeechBase/asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
import librosa
import soundfile

from paddlespeech.server.engine.asr.online.asr_engine import ASREngine
from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler
from paddlespeech.server.engine.asr.online.python.asr_engine import ASREngine
from paddlespeech.server.engine.asr.online.python.asr_engine import PaddleASRConnectionHanddler
from paddlespeech.server.utils.config import get_config

def readWave(samples):
Expand Down Expand Up @@ -59,29 +59,4 @@ def onlineASR(self, samples:bytes=None, is_finished=False):
self.connection_handler.reset()
return asr_results


if __name__ == '__main__':
config_path = r"../../PaddleSpeech/paddlespeech/server/conf/ws_conformer_application.yaml"

wav_path = r"../../source/demo/demo_16k.wav"
samples, sample_rate = soundfile.read(wav_path, dtype='int16')

asr = ASR(config_path=config_path)
end_result = asr.offlineASR(samples=samples, sample_rate=sample_rate)
print("端到端识别结果:", end_result)

for sub_wav in readWave(samples=samples):
# print(sub_wav)
message = sub_wav.tobytes()
offline_result = asr.onlineASR(message, is_finished=False)
print("流式识别结果: ", offline_result)
offline_result = asr.onlineASR(is_finished=True)
print("流式识别结果: ", offline_result)








5 changes: 0 additions & 5 deletions demos/speech_web/speech_server/src/SpeechBase/nlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,4 @@ def ie(self, text):
result = self.ie_model(text)
return result

if __name__ == '__main__':
ie_model_path = "../../source/model/"
nlp = NLP(ie_model_path=ie_model_path)
text = "今天早上我从大牛坊去百度科技园花了七百块钱"
print(nlp.ie(text))

36 changes: 0 additions & 36 deletions demos/speech_web/speech_server/src/SpeechBase/sql_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,40 +113,4 @@ def decode_vector(self, vector_base64, dtype=np.float32):
b = base64.b64decode(vector_base64)
vc = np.frombuffer(b, dtype=dtype)
return vc

if __name__ == '__main__':
db_path = "../../source/db/vpr.sqlite"
db = DataBase(db_path)

# 准备数据
import numpy as np
vector = np.random.randn((192)).astype(np.float32).tobytes()
vector_base64 = base64.b64encode(vector).decode('utf8')
username = "sss"
wav_path = r"../../source/demo/demo_16k.wav"

# 插入数据
db.insert_one(username, vector_base64, wav_path)

# 查询数据
res_all = db.select_all()
print("res_all: ", res_all)

s_id = res_all[0]['id']
res_id = db.select_by_id(s_id)
print("res_id: ", res_id)

res_uername = db.select_by_username(username)
print("res_username: ", res_uername)

# base64还原
b = base64.b64decode(res_uername[0]['vector'])
vc = np.frombuffer(b, dtype=np.float32)
print(vc)

# 删除数据
db.drop_by_username(username)
res_all = db.select_all()
print("删除后 res_all: ", res_all)
db.drop_all()

120 changes: 104 additions & 16 deletions demos/speech_web/speech_server/src/SpeechBase/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
# 4. 流式推理

import base64

import math
import logging
import numpy as np
from paddlespeech.server.utils.onnx_infer import get_sess
from paddlespeech.t2s.frontend.zh_frontend import Frontend
Expand All @@ -17,14 +18,14 @@

from paddlespeech.server.engine.tts.online.onnx.tts_engine import TTSEngine


class TTS:
def __init__(self, config_path):
self.config = get_config(config_path)['tts_online-onnx']
self.config['voc_block'] = 36
self.engine = TTSEngine()
self.engine = TTSEngine()
self.engine.init(self.config)
self.engine.warm_up()
self.executor = self.engine.executor
#self.engine.warm_up()

# 前端初始化
self.frontend = Frontend(
Expand Down Expand Up @@ -81,8 +82,105 @@ def offlineTTS(self, text):
return wavs

def streamTTS(self, text):
for sub_wav_base64 in self.engine.run(sentence=text):
yield sub_wav_base64

get_tone_ids = False
merge_sentences = False

# front
input_ids = self.frontend.get_input_ids(
text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]

for i in range(len(phone_ids)):
part_phone_ids = phone_ids[i].numpy()
voc_chunk_id = 0

# fastspeech2_csmsc
if self.config.am == "fastspeech2_csmsc_onnx":
# am
mel = self.executor.am_sess.run(
output_names=None, input_feed={'text': part_phone_ids})
mel = mel[0]

# voc streaming
mel_chunks = get_chunks(mel, self.config.voc_block, self.config.voc_pad, "voc")
voc_chunk_num = len(mel_chunks)
for i, mel_chunk in enumerate(mel_chunks):
sub_wav = self.executor.voc_sess.run(
output_names=None, input_feed={'logmel': mel_chunk})
sub_wav = self.depadding(sub_wav[0], voc_chunk_num, i,
self.config.voc_block, self.config.voc_pad,
self.config.voc_upsample)

yield self.after_process(sub_wav)

# fastspeech2_cnndecoder_csmsc
elif self.config.am == "fastspeech2_cnndecoder_csmsc_onnx":
# am
orig_hs = self.executor.am_encoder_infer_sess.run(
None, input_feed={'text': part_phone_ids})
orig_hs = orig_hs[0]

# streaming voc chunk info
mel_len = orig_hs.shape[1]
voc_chunk_num = math.ceil(mel_len / self.config.voc_block)
start = 0
end = min(self.config.voc_block + self.config.voc_pad, mel_len)

# streaming am
hss = get_chunks(orig_hs, self.config.am_block, self.config.am_pad, "am")
am_chunk_num = len(hss)
for i, hs in enumerate(hss):
am_decoder_output = self.executor.am_decoder_sess.run(
None, input_feed={'xs': hs})
am_postnet_output = self.executor.am_postnet_sess.run(
None,
input_feed={
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
})
am_output_data = am_decoder_output + np.transpose(
am_postnet_output[0], (0, 2, 1))
normalized_mel = am_output_data[0][0]

sub_mel = denorm(normalized_mel, self.executor.am_mu,
self.executor.am_std)
sub_mel = self.depadding(sub_mel, am_chunk_num, i,
self.config.am_block, self.config.am_pad, 1)

if i == 0:
mel_streaming = sub_mel
else:
mel_streaming = np.concatenate(
(mel_streaming, sub_mel), axis=0)

# streaming voc
# 当流式AM推理的mel帧数大于流式voc推理的chunk size,开始进行流式voc 推理
while (mel_streaming.shape[0] >= end and
voc_chunk_id < voc_chunk_num):
voc_chunk = mel_streaming[start:end, :]

sub_wav = self.executor.voc_sess.run(
output_names=None, input_feed={'logmel': voc_chunk})
sub_wav = self.depadding(
sub_wav[0], voc_chunk_num, voc_chunk_id,
self.config.voc_block, self.config.voc_pad, self.config.voc_upsample)

yield self.after_process(sub_wav)

voc_chunk_id += 1
start = max(
0, voc_chunk_id * self.config.voc_block - self.config.voc_pad)
end = min(
(voc_chunk_id + 1) * self.config.voc_block + self.config.voc_pad,
mel_len)

else:
logging.error(
"Only support fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc on streaming tts."
)


def streamTTSBytes(self, text):
for wav in self.engine.executor.infer(
Expand All @@ -106,16 +204,6 @@ def streamTTS_TVM(self, text):
# 用 TVM 优化
pass

if __name__ == '__main__':
text = "啊哈哈哈哈哈哈啊哈哈哈哈哈哈啊哈哈哈哈哈哈啊哈哈哈哈哈哈啊哈哈哈哈哈哈"
config_path="../../PaddleSpeech/demos/streaming_tts_server/conf/tts_online_application.yaml"
tts = TTS(config_path)

for sub_wav in tts.streamTTS(text):
print("sub_wav_base64: ", len(sub_wav))

end_wav = tts.offlineTTS(text)
print(end_wav)



Loading