# Post-patch form of ``inference`` from
# examples/text_to_speech_synthesizer/waveglow_handler.py
def inference(self, data):
    """Synthesize audio for an encoded text tensor.

    ``data`` is first passed to ``self.tacotron2_model.infer`` together with
    a tensor of input lengths; the first of the three values it returns (the
    mel spectrogram) is then passed to ``self.waveglow_model.infer``.

    Args:
        data: Tensor of encoded text. Its second dimension is used as the
            sequence length.
            NOTE(review): presumably shaped (batch, sequence) -- confirm
            against ``preprocess``.

    Returns:
        Whatever ``self.waveglow_model.infer`` returns for the mel output.
    """
    with torch.no_grad():
        # One length entry per row of ``data``, created directly as int64 on
        # the handler's device (no int32 CPU tensor followed by a cast).
        # Every row is given the full width ``data.size(1)``.
        # NOTE(review): this treats any padding as real input -- confirm
        # that batched inputs are unpadded or of equal length.
        input_lengths = torch.full(
            (data.size(0),),
            data.size(1),
            dtype=torch.int64,
            device=self.device,
        )
        mel, _, _ = self.tacotron2_model.infer(data, input_lengths)
        audio = self.waveglow_model.infer(mel)

    return audio