Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Nvidia Waveglow/Tacotron2 example #1905

Merged
merged 3 commits into from
Oct 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions examples/text_to_speech_synthesizer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

**This example works only on an NVIDIA CUDA device, not on CPU.**

We have used the following Waveglow/Tacotron2 model for this example:
We have used the following Waveglow/Tacotron2 model for this example:

https://pytorch.org/hub/nvidia_deeplearningexamples_waveglow/

Expand All @@ -20,40 +20,40 @@ pip install librosa --user
# Serve the WaveGlow speech synthesis model on TorchServe

* Generate the model archive for the WaveGlow speech synthesis model using the following command:

```bash
python create_mar.py
./create_mar.sh
```

* Register the model on TorchServe using the above model archive file

```bash
mkdir model_store
mv waveglow_synthesizer.mar model_store/
torchserve --start --model-store model_store --models waveglow_synthesizer.mar
```
* Run inference and download audio output using curl command :
* Run inference and download audio output using curl command :
```bash
curl http://127.0.0.1:8080/predictions/waveglow_synthesizer -T sample_text.txt -o audio.wav
```

* Run inference and download audio output using python script :

```python
import requests

files = {'data': open('sample_text.txt','rb')}
response = requests.post('http://localhost:8080/predictions/waveglow_synthesizer', files=files)
data = response.content

with open('audio.wav', 'wb') as audio_file:
audio_file.write(data)
```

* Change the host and port in the above samples to match your server configuration.

* Response :
An audio.wav file gets downloaded.

**Note:** The above example works only for short input text. Refer to the following NVIDIA/DeepLearningExamples issue for more details:
https://github.com/NVIDIA/DeepLearningExamples/issues/497
29 changes: 0 additions & 29 deletions examples/text_to_speech_synthesizer/create_mar.py

This file was deleted.

59 changes: 37 additions & 22 deletions examples/text_to_speech_synthesizer/waveglow_handler.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import logging
import numpy as np
import os
import torch
import uuid
import zipfile

import numpy as np
import torch
from scipy.io.wavfile import write
from waveglow_model import WaveGlow
from scipy.io.wavfile import write, read

from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)


class WaveGlowSpeechSynthesizer(BaseHandler):

def __init__(self):
self.waveglow_model = None
self.tacotron2_model = None
Expand All @@ -31,16 +32,21 @@ def _unwrap_distributed(self, state_dict):
"""
new_state_dict = {}
for key, value in state_dict.items():
new_key = key.replace('module.', '')
new_key = key.replace("module.", "")
new_state_dict[new_key] = value
return new_state_dict

def _load_tacotron2_model(self, model_dir):
from PyTorch.SpeechSynthesis.Tacotron2.tacotron2 import model as tacotron2
from PyTorch.SpeechSynthesis.Tacotron2.tacotron2.text import text_to_sequence
tacotron2_checkpoint = torch.load(os.path.join(model_dir, 'nvidia_tacotron2pyt_fp32_20190427.pth'))
tacotron2_state_dict = self._unwrap_distributed(tacotron2_checkpoint['state_dict'])
tacotron2_config = tacotron2_checkpoint['config']

tacotron2_checkpoint = torch.load(
os.path.join(model_dir, "nvidia_tacotron2pyt_fp32_20190427.pth")
)
tacotron2_state_dict = self._unwrap_distributed(
tacotron2_checkpoint["state_dict"]
)
tacotron2_config = tacotron2_checkpoint["config"]
self.tacotron2_model = tacotron2.Tacotron2(**tacotron2_config)
self.tacotron2_model.load_state_dict(tacotron2_state_dict)
self.tacotron2_model.text_to_sequence = text_to_sequence
Expand All @@ -51,16 +57,20 @@ def initialize(self, ctx):

properties = ctx.system_properties
model_dir = properties.get("model_dir")
if not torch.cuda.is_available() or properties.get("gpu_id") is None :
if not torch.cuda.is_available() or properties.get("gpu_id") is None:
raise RuntimeError("This model is not supported on CPU machines.")
self.device = torch.device("cuda:" + str(properties.get("gpu_id")))

with zipfile.ZipFile(model_dir + '/tacotron.zip', 'r') as zip_ref:
with zipfile.ZipFile(model_dir + "/tacotron.zip", "r") as zip_ref:
zip_ref.extractall(model_dir)

waveglow_checkpoint = torch.load(os.path.join(model_dir, "nvidia_waveglowpyt_fp32_20190427.pth"))
waveglow_state_dict = self._unwrap_distributed(waveglow_checkpoint['state_dict'])
waveglow_config = waveglow_checkpoint['config']
waveglow_checkpoint = torch.load(
os.path.join(model_dir, "nvidia_waveglowpyt_fp32_20190427.pth")
)
waveglow_state_dict = self._unwrap_distributed(
waveglow_checkpoint["state_dict"]
)
waveglow_config = waveglow_checkpoint["config"]
self.waveglow_model = WaveGlow(**waveglow_config)
self.waveglow_model.load_state_dict(waveglow_state_dict)
self.waveglow_model = self.waveglow_model.remove_weightnorm(self.waveglow_model)
Expand All @@ -69,29 +79,34 @@ def initialize(self, ctx):

self._load_tacotron2_model(model_dir)

logger.debug('WaveGlow model file loaded successfully')
logger.debug("WaveGlow model file loaded successfully")
self.initialized = True

def preprocess(self, data):
"""
converts text to sequence of IDs using tacatron2 text_to_sequence
with english cleaners to transform text and standardize input
(ex: lowercasing, expanding abbreviations and numbers, etc.)
returns an Numpy array
converts text to sequence of IDs using tacatron2 text_to_sequence
with english cleaners to transform text and standardize input
(ex: lowercasing, expanding abbreviations and numbers, etc.)
returns an Numpy array
"""
text = data[0].get("data")
if text is None:
text = data[0].get("body")
text = text.decode('utf-8')
text = text.decode("utf-8")

sequence = np.array(self.tacotron2_model.text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = np.array(
self.tacotron2_model.text_to_sequence(text, ["english_cleaners"])
)[None, :]
sequence = torch.from_numpy(sequence).to(device=self.device, dtype=torch.int64)

return sequence

def inference(self, data):
with torch.no_grad():
_, mel, _, _ = self.tacotron2_model.infer(data)
input_lengths = torch.IntTensor([data.size(1)]).to(
device=self.device, dtype=torch.int64
)
mel, _, _ = self.tacotron2_model.infer(data, input_lengths)
audio = self.waveglow_model.infer(mel)

return audio
Expand All @@ -100,7 +115,7 @@ def postprocess(self, inference_output):
audio_numpy = inference_output[0].data.cpu().numpy()
path = "/tmp/{}.wav".format(uuid.uuid4().hex)
write(path, 22050, audio_numpy)
with open(path, 'rb') as output:
with open(path, "rb") as output:
data = output.read()
os.remove(path)
return [data]