-
Notifications
You must be signed in to change notification settings - Fork 511
How to scale TTS output duration to a given length
szhaomsft edited this page Oct 22, 2020
·
1 revision
In some scenarios, you might want to control the duration of the TTS output. For example, when you add voice dubbing with TTS, the audio needs to stay in sync with the video. Here is one way to do it.
The basic idea is to first generate the TTS output at the default rate. Then calculate the ratio of the actual duration to the expected duration, and use that ratio to adjust the rate of the TTS output with the SSML prosody rate attribute.
/// <summary>
/// Synthesizes <paramref name="text"/> twice: first at the default speaking rate to measure how long
/// the resulting audio is, then again with an SSML prosody rate derived from that measurement so the
/// audio length approaches <paramref name="duration"/>.
/// </summary>
/// <remarks>
/// The first pass is written to the relative path "temp.wav"; the adjusted pass is written to
/// "temp.wav-{duration}.wav". If the first pass does not complete, the problem is logged to the
/// console and the second pass is skipped, because there is no fresh audio to measure.
/// <paramref name="text"/> is treated as plain text and is XML-escaped before being placed in the SSML.
/// </remarks>
/// <param name="voice">Voice name used for the synthesizer configuration and the SSML voice element.</param>
/// <param name="locale">Locale used for the synthesizer configuration and the SSML xml:lang attributes.</param>
/// <param name="text">Plain text to speak.</param>
/// <param name="duration">Desired audio length in seconds; must be positive and finite.</param>
/// <returns>A task that completes when both synthesis passes have finished (or the first one failed).</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="duration"/> is not a positive, finite number.
/// </exception>
public static async Task SythensizeTextByDuration(string voice, string locale, string text, double duration)
{
    // A zero, negative, NaN or infinite target would produce a meaningless prosody rate.
    if (double.IsNaN(duration) || double.IsInfinity(duration) || duration <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(duration), duration, "Target duration must be a positive, finite number of seconds.");
    }

    var config = SpeechConfig.FromSubscription(CogSvcKey.SpeechKey, CogSvcKey.SpeechRegion);
    config.SpeechSynthesisLanguage = locale;
    config.SpeechSynthesisVoiceName = voice;

    string file = "temp.wav";

    // Escape once so characters such as '&' or '<' in the text cannot break the SSML document.
    string escapedText = System.Security.SecurityElement.Escape(text);

    // Pass 1: default rate, only used to measure the natural duration of the utterance.
    if (!await SynthesizeSsmlToFileAsync(config, BuildSsml(voice, locale, escapedText), file))
    {
        // Without a fresh baseline file the measurement below would be wrong or would throw.
        return;
    }

    TimeSpan span;
    using (var reader = new WaveFileReader(file))
    {
        span = reader.TotalTime;
    }

    // Relative rate change in percent: actual/target - 1, so audio that is twice as long as the
    // target yields +100% (speak twice as fast) and audio half as long yields -50%.
    double ratePercent = (span.TotalSeconds / duration - 1.0) * 100;

    // Format with the invariant culture so the decimal separator is always '.', regardless of
    // the machine's regional settings.
    // NOTE(review): the service may only honor a limited range of rates — confirm against the
    // SSML prosody documentation before relying on large adjustments.
    string rate = ratePercent.ToString("0.##", System.Globalization.CultureInfo.InvariantCulture);

    // Pass 2: same text wrapped in a prosody element carrying the computed rate.
    string adjustedBody = $"<prosody rate='{rate}%'>{escapedText}</prosody>";
    await SynthesizeSsmlToFileAsync(config, BuildSsml(voice, locale, adjustedBody), file + $"-{duration}.wav");
}

/// <summary>
/// Wraps <paramref name="innerXml"/> in the speak/voice SSML envelope used by both synthesis passes.
/// </summary>
/// <param name="voice">Voice name; escaped before being written into the name attribute.</param>
/// <param name="locale">Locale; escaped before being written into the xml:lang attributes.</param>
/// <param name="innerXml">Already well-formed XML content for the voice element.</param>
/// <returns>The complete SSML document as a string.</returns>
private static string BuildSsml(string voice, string locale, string innerXml)
{
    string safeVoice = System.Security.SecurityElement.Escape(voice);
    string safeLocale = System.Security.SecurityElement.Escape(locale);
    return $"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='{safeLocale}'><voice xml:lang='{safeLocale}' xml:gender='Female' name='{safeVoice}'>{innerXml}</voice></speak>";
}

/// <summary>
/// Synthesizes <paramref name="ssml"/> into the WAV file at <paramref name="path"/> and logs
/// cancellation details to the console when synthesis is canceled.
/// </summary>
/// <param name="config">Speech configuration to create the synthesizer with.</param>
/// <param name="ssml">SSML document to speak.</param>
/// <param name="path">Output WAV file path.</param>
/// <returns><c>true</c> when the result reason is SynthesizingAudioCompleted; otherwise <c>false</c>.</returns>
private static async Task<bool> SynthesizeSsmlToFileAsync(SpeechConfig config, string ssml, string path)
{
    // Stacked usings dispose in reverse order (result, synthesizer, file output), so the output
    // file is released by the time this method returns.
    using (var fileOutput = AudioConfig.FromWavFileOutput(path))
    using (var fileSynthesizer = new SpeechSynthesizer(config, fileOutput))
    using (var result = await fileSynthesizer.SpeakSsmlAsync(ssml))
    {
        if (result.Reason == ResultReason.SynthesizingAudioCompleted)
        {
            return true;
        }

        if (result.Reason == ResultReason.Canceled)
        {
            var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
            Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");
            if (cancellation.Reason == CancellationReason.Error)
            {
                Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                Console.WriteLine($"CANCELED: ErrorDetails=[{cancellation.ErrorDetails}]");
                Console.WriteLine($"CANCELED: Did you update the subscription info?");
            }
        }

        return false;
    }
}
- Azure TTS: Empower every person and every organization on the planet to have a delightful digital voice!
- Azure Custom Voice: Build your one-of-a-kind Custom Voice and close to human Neural TTS in cloud and edge!