Skip to content

Commit

Permalink
add vector server, test=doc
Browse files Browse the repository at this point in the history
  • Loading branch information
LeoMax-Xiong committed May 1, 2022
1 parent cdb9a1b commit b1ddddd
Show file tree
Hide file tree
Showing 18 changed files with 735 additions and 11 deletions.
2 changes: 1 addition & 1 deletion demos/speaker_verification/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/doc
You can choose one way from easy, medium and hard to install paddlespeech.

### 2. Prepare Input File
The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model.
The input of this cli demo should be a WAV file(`.wav`), and the sample rate must be the same as the model.

Here are sample files for this demo that can be downloaded:
```bash
Expand Down
6 changes: 3 additions & 3 deletions demos/speaker_verification/README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
## 介绍
声纹识别是一项用计算机程序自动提取说话人特征的技术。

这个 demo 是一个从给定音频文件提取说话人特征,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。
这个 demo 是从一个给定音频文件中提取说话人特征,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。

## 使用方法
### 1. 安装
请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)

你可以从 easy,medium,hard 三中方式中选择一种方式安装
你可以从 easy,medium,hard 三种方式中选择一种方式安装

### 2. 准备输入
这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
声纹cli demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。

可以下载此 demo 的示例音频:
```bash
Expand Down
9 changes: 7 additions & 2 deletions demos/streaming_asr_server/websocket_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def main(args):
handler = ASRWsAudioHandler(
args.server_ip,
args.port,
endpoint=args.endpoint,
punc_server_ip=args.punc_server_ip,
punc_server_port=args.punc_server_port)
loop = asyncio.get_event_loop()
Expand All @@ -36,7 +37,7 @@ def main(args):
if args.wavfile and os.path.exists(args.wavfile):
logger.info(f"start to process the wavscp: {args.wavfile}")
result = loop.run_until_complete(handler.run(args.wavfile))
result = result["result"]
# result = result["result"]
logger.info(f"asr websocket client finished : {result}")

# support to process batch audios from wav.scp
Expand Down Expand Up @@ -69,7 +70,11 @@ def main(args):
default=8091,
dest="punc_server_port",
help='Punctuation server port')

parser.add_argument(
"--endpoint",
type=str,
default="/paddlespeech/asr/streaming",
help="ASR websocket endpoint")
parser.add_argument(
"--wavfile",
action="store",
Expand Down
3 changes: 2 additions & 1 deletion paddlespeech/cli/vector/infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,8 @@ def _init_from_path(self,
model_type: str='ecapatdnn_voxceleb12',
sample_rate: int=16000,
cfg_path: Optional[os.PathLike]=None,
ckpt_path: Optional[os.PathLike]=None):
ckpt_path: Optional[os.PathLike]=None,
task=None):
"""Init the neural network from the model path
Args:
Expand Down
20 changes: 20 additions & 0 deletions paddlespeech/server/README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,23 @@ paddlespeech_server start --config_file conf/tts_online_application.yaml
```
paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```

## 声纹识别

### 启动声纹识别服务

```
paddlespeech_server start --config_file conf/vector_application.yaml
```

### 获取说话人音频声纹

```
paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav
```

### 两个说话人音频声纹打分

```
paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav
```
99 changes: 98 additions & 1 deletion paddlespeech/server/bin/paddlespeech_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

__all__ = [
'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor',
'ASROnlineClientExecutor', 'CLSClientExecutor'
'ASROnlineClientExecutor', 'CLSClientExecutor', 'VectorClientExecutor'
]


Expand Down Expand Up @@ -583,3 +583,100 @@ def __call__(self, input: str, server_ip: str="127.0.0.1", port: int=8090):
response_dict = res.json()
punc_text = response_dict["result"]["punc_text"]
return punc_text


@cli_client_register(
    name='paddlespeech_client.vector', description='visit the vector service')
class VectorClientExecutor(BaseExecutor):
    """Command-line / Python client for the vector (speaker embedding) service.

    Two tasks are supported:

    * ``spk``   - send one audio file (``--input``) through ``VectorHttpHandler``.
    * ``score`` - send an enroll audio and a test audio (``--enroll`` /
      ``--test``) through ``VectorScoreHttpHandler``.
    """

    def __init__(self):
        super(VectorClientExecutor, self).__init__()
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech_client.vector', add_help=True)
        self.parser.add_argument(
            '--server_ip', type=str, default='127.0.0.1', help='server ip')
        self.parser.add_argument(
            '--port', type=int, default=8090, help='server port')
        self.parser.add_argument(
            '--input',
            type=str,
            default=None,
            help='audio file to be processed by the vector server '
            '(used by the "spk" task).')
        self.parser.add_argument(
            '--task', type=str, default="spk", help="The vector service task")
        self.parser.add_argument(
            "--enroll", type=str, default=None, help="The enroll audio")
        self.parser.add_argument(
            "--test", type=str, default=None, help="The test audio")

    def execute(self, argv: List[str]) -> bool:
        """Parse ``argv``, send the request and log the response.

        Args:
            argv (List[str]): the command-line arguments of the request.

        Returns:
            bool: True when the request completed without raising,
                False otherwise (the exception is logged, not re-raised).
        """
        args = self.parser.parse_args(argv)

        try:
            time_start = time.time()
            res = self(
                input=args.input,
                server_ip=args.server_ip,
                port=args.port,
                enroll_audio=args.enroll,
                test_audio=args.test,
                task=args.task)
            time_end = time.time()
            logger.info(f"The vector: {res}")
            logger.info("Response time %f s." % (time_end - time_start))
            return True
        except Exception as e:
            # Top-level CLI boundary: report the failure through the return
            # flag instead of propagating the exception.
            logger.error("Failed to extract vector.")
            logger.error(e)
            return False

    @stats_wrapper
    def __call__(self,
                 input: str,
                 server_ip: str="127.0.0.1",
                 port: int=8090,
                 audio_format: str="wav",
                 sample_rate: int=16000,
                 enroll_audio: str=None,
                 test_audio: str=None,
                 task="spk"):
        """Python API to call the vector service.

        Args:
            input (str): the audio file used by the "spk" task.
            server_ip (str, optional): the server ip. Defaults to "127.0.0.1".
            port (int, optional): the server port. Defaults to 8090.
            audio_format (str, optional): audio format forwarded to the
                handler. Defaults to "wav".
            sample_rate (int, optional): sample rate forwarded to the handler.
                Defaults to 16000.
            enroll_audio (str, optional): the enroll audio used by the
                "score" task. Defaults to None.
            test_audio (str, optional): the test audio used by the "score"
                task. Defaults to None.
            task (str, optional): "spk" or "score". Defaults to "spk".

        Returns:
            Whatever the selected handler's ``run`` returns, or None when
            ``task`` is not supported (an error is logged in that case).
        """
        if task == "spk":
            from paddlespeech.server.utils.audio_handler import VectorHttpHandler
            logger.info("vector http client start")
            logger.info(f"the input audio: {input}")
            handler = VectorHttpHandler(server_ip=server_ip, port=port)
            res = handler.run(input, audio_format, sample_rate)
            return res
        elif task == "score":
            from paddlespeech.server.utils.audio_handler import VectorScoreHttpHandler
            logger.info("vector score http client start")
            logger.info(
                f"enroll audio: {enroll_audio}, test audio: {test_audio}")
            handler = VectorScoreHttpHandler(server_ip=server_ip, port=port)
            res = handler.run(enroll_audio, test_audio, audio_format,
                              sample_rate)
            logger.info(f"The vector score is: {res}")
            # Return the score so Python callers (and execute()) receive it;
            # previously this branch fell through and returned None.
            return res
        else:
            logger.error(f"Sorry, we have not support such task {task}")
            return None
13 changes: 12 additions & 1 deletion paddlespeech/server/conf/application.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ port: 8090
# protocol = ['websocket', 'http'] (only one can be selected).
# http only supports offline engine type.
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python']
engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']


#################################################################################
Expand Down Expand Up @@ -166,4 +166,15 @@ text_python:
cfg_path: # [optional]
ckpt_path: # [optional]
vocab_file: # [optional]
device: # set 'gpu:id' or 'cpu'


################################### Vector ######################################
################### Vector task: spk; engine_type: python #######################
vector_python:
task: spk
model_type: 'ecapatdnn_voxceleb12'
sample_rate: 16000
cfg_path: # [optional]
ckpt_path: # [optional]
device: # set 'gpu:id' or 'cpu'
32 changes: 32 additions & 0 deletions paddlespeech/server/conf/vector_application.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# This is the parameter configuration file for PaddleSpeech Serving.

#################################################################################
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 8090

# The task format in the engine_list is: <speech task>_<engine type>
# protocol = ['http'] (only one can be selected).
# http only supports offline engine type.
protocol: 'http'
engine_list: ['vector_python']


#################################################################################
# ENGINE CONFIG #
#################################################################################

################################### Vector ######################################
################### Vector task: spk; engine_type: python #######################
vector_python:
task: spk
model_type: 'ecapatdnn_voxceleb12'
sample_rate: 16000
cfg_path: # [optional]
ckpt_path: # [optional]
device: # set 'gpu:id' or 'cpu'




8 changes: 8 additions & 0 deletions paddlespeech/server/engine/asr/online/asr_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
import copy
import os
import time
from typing import Optional

import numpy as np
Expand Down Expand Up @@ -153,6 +154,12 @@ def init(self):
self.n_shift = self.preprocess_conf.process[0]['n_shift']

def extract_feat(self, samples):

# we compute the elapsed time of the first char occurring
# and we record the start time when the first pcm sample arrives
# if self.first_char_occur_elapsed is not None:
# self.first_char_occur_elapsed = time.time()

if "deepspeech2online" in self.model_type:
# self.reamined_wav stores all the samples,
# include the original remained_wav and this package samples
Expand Down Expand Up @@ -290,6 +297,7 @@ def reset(self):
self.chunk_num = 0
self.global_frame_offset = 0
self.result_transcripts = ['']
self.first_char_occur_elapsed = None

def decode(self, is_finished=False):
if "deepspeech2online" in self.model_type:
Expand Down
3 changes: 3 additions & 0 deletions paddlespeech/server/engine/engine_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,8 @@ def get_engine(engine_name: Text, engine_type: Text):
elif engine_name.lower() == 'text' and engine_type.lower() == 'python':
from paddlespeech.server.engine.text.python.text_engine import TextEngine
return TextEngine()
elif engine_name.lower() == 'vector' and engine_type.lower() == 'python':
from paddlespeech.server.engine.vector.python.vector_engine import VectorEngine
return VectorEngine()
else:
return None
Empty file.
Empty file.
Loading

0 comments on commit b1ddddd

Please sign in to comment.