Skip to content

Commit

Permalink
Merge pull request #72 from jambonz/feat/verbio_speech
Browse files Browse the repository at this point in the history
add verbio tts/stt
  • Loading branch information
davehorton authored May 28, 2024
2 parents acb2d0c + 2212be3 commit e60a2d2
Show file tree
Hide file tree
Showing 10 changed files with 204 additions and 6 deletions.
1 change: 1 addition & 0 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module.exports = (opts, logger) => {
purgeTtsCache: require('./lib/purge-tts-cache').bind(null, client, logger),
addFileToCache: require('./lib/add-file-to-cache').bind(null, client, logger),
synthAudio: require('./lib/synth-audio').bind(null, client, createHash, retrieveHash, logger),
getVerbioAccessToken: require('./lib/get-verbio-token').bind(null, client, logger),
getNuanceAccessToken: require('./lib/get-nuance-access-token').bind(null, client, logger),
getIbmAccessToken: require('./lib/get-ibm-access-token').bind(null, client, logger),
getAwsAuthToken: require('./lib/get-aws-sts-token').bind(null, logger, createHash, retrieveHash),
Expand Down
3 changes: 3 additions & 0 deletions lib/constants.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module.exports = {
HTTP_TIMEOUT: 5000
};
2 changes: 1 addition & 1 deletion lib/get-ibm-access-token.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ const formurlencoded = require('form-urlencoded');
const {Pool} = require('undici');
const pool = new Pool('https://iam.cloud.ibm.com');
const {makeIbmKey, noopLogger} = require('./utils');
const { HTTP_TIMEOUT } = require('./constants');
const debug = require('debug')('jambonz:realtimedb-helpers');
const HTTP_TIMEOUT = 5000;

async function getIbmAccessToken(client, logger, apiKey) {
logger = logger || noopLogger;
Expand Down
2 changes: 1 addition & 1 deletion lib/get-nuance-access-token.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ const formurlencoded = require('form-urlencoded');
const {Pool} = require('undici');
const pool = new Pool('https://auth.crt.nuance.com');
const {makeNuanceKey, makeBasicAuthHeader, noopLogger} = require('./utils');
const { HTTP_TIMEOUT } = require('./constants');
const debug = require('debug')('jambonz:realtimedb-helpers');
const HTTP_TIMEOUT = 5000;

async function getNuanceAccessToken(client, logger, clientId, secret, scope) {
logger = logger || noopLogger;
Expand Down
28 changes: 27 additions & 1 deletion lib/get-tts-voices.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
const assert = require('assert');
const {noopLogger, createNuanceClient, createKryptonClient} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const getVerbioAccessToken = require('./get-verbio-token');
const {GetVoicesRequest, Voice} = require('../stubs/nuance/synthesizer_pb');
const TextToSpeechV1 = require('ibm-watson/text-to-speech/v1');
const { IamAuthenticator } = require('ibm-watson/auth');
const ttsGoogle = require('@google-cloud/text-to-speech');
const { PollyClient, DescribeVoicesCommand } = require('@aws-sdk/client-polly');
const getAwsAuthToken = require('./get-aws-sts-token');
const {Pool} = require('undici');
const { HTTP_TIMEOUT } = require('./constants');
const verbioVoicePool = new Pool('https://us.rest.speechcenter.verbio.com');

const getIbmVoices = async(client, logger, credentials) => {
const {tts_region, tts_api_key} = credentials;
Expand Down Expand Up @@ -117,6 +121,26 @@ const getAwsVoices = async(_client, createHash, retrieveHash, logger, credential
}
};

/**
 * List the TTS voices available from Verbio.
 *
 * Obtains an access token via getVerbioAccessToken and then issues
 * GET /api/v1/voices against the Verbio REST pool.
 *
 * @param {object} client - redis client, forwarded to getVerbioAccessToken
 * @param {object} logger - logger exposing info()
 * @param {object} credentials - Verbio credentials, forwarded to getVerbioAccessToken
 * @returns {Promise<*>} the parsed JSON body of a 200 response
 * @throws {Error} on transport/token failure, or an Error carrying `statusCode`
 *   (and the raw response text in `body`) when Verbio answers with a non-200 status
 */
const getVerbioVoices = async(client, logger, credentials) => {
  try {
    const access_token = await getVerbioAccessToken(client, logger, credentials);
    // NOTE(review): `timeout` / `followRedirects` mirror the other helpers in this library;
    // confirm the installed undici version honors them (it documents headersTimeout/bodyTimeout).
    const {statusCode, body} = await verbioVoicePool.request({
      path: '/api/v1/voices',
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${access_token.access_token}`,
        'User-Agent': 'jambonz'
      },
      timeout: HTTP_TIMEOUT,
      followRedirects: false
    });
    if (200 !== statusCode) {
      // Do not hand an error payload back to the caller as if it were a voice list.
      const err = new Error(`getVerbioVoices - Verbio returned status code ${statusCode}`);
      err.statusCode = statusCode;
      err.body = await body.text();
      throw err;
    }
    return await body.json();
  } catch (err) {
    logger.info({err}, 'getVerbioVoices - failed to list voices for Verbio');
    throw err;
  }
};

/**
* Synthesize speech to an mp3 file, and also cache the generated speech
* in redis (base64 format) for 24 hours so as to avoid unnecessarily paying
Expand All @@ -136,7 +160,7 @@ const getAwsVoices = async(_client, createHash, retrieveHash, logger, credential
async function getTtsVoices(client, createHash, retrieveHash, logger, {vendor, credentials}) {
logger = logger || noopLogger;

assert.ok(['nuance', 'ibm', 'google', 'aws', 'polly'].includes(vendor),
assert.ok(['nuance', 'ibm', 'google', 'aws', 'polly', 'verbio'].includes(vendor),
`getTtsVoices not supported for vendor ${vendor}`);

switch (vendor) {
Expand All @@ -149,6 +173,8 @@ async function getTtsVoices(client, createHash, retrieveHash, logger, {vendor, c
case 'aws':
case 'polly':
return getAwsVoices(client, createHash, retrieveHash, logger, credentials);
case 'verbio':
return getVerbioVoices(client, logger, credentials);
default:
break;
}
Expand Down
51 changes: 51 additions & 0 deletions lib/get-verbio-token.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
const {Pool} = require('undici');
const { noopLogger, makeVerbioKey } = require('./utils');
const { HTTP_TIMEOUT } = require('./constants');
const pool = new Pool('https://auth.speechcenter.verbio.com:444');
const debug = require('debug')('jambonz:realtimedb-helpers');

/**
 * Obtain a Verbio access token, using redis as a cache.
 *
 * The cache key is derived from client_id via makeVerbioKey. On a cache miss the
 * token is requested from POST /api/v1/token and stored in redis with a TTL
 * computed from the response's `expiration_time` minus the current time, less a
 * 30 second safety margin.
 *
 * @param {object} client - redis client exposing get() and set()
 * @param {object} [logger] - logger; falls back to noopLogger when falsy
 * @param {object} credentials
 * @param {string} credentials.client_id
 * @param {string} credentials.client_secret
 * @returns {Promise<object>} `{access_token, servedFromCache: true}` on a cache hit,
 *   otherwise the token response JSON plus `servedFromCache: false`
 * @throws {Error} on redis/transport failure, or an Error carrying `statusCode`
 *   when Verbio answers with a non-200 status
 */
async function getVerbioAccessToken(client, logger, credentials) {
  logger = logger || noopLogger;
  const { client_id, client_secret } = credentials;
  try {
    const key = makeVerbioKey(client_id);
    const access_token = await client.get(key);
    if (access_token) {
      return {access_token, servedFromCache: true};
    }

    const payload = {
      client_id,
      client_secret
    };

    // NOTE(review): `timeout` / `followRedirects` mirror the other token helpers in this library;
    // confirm the installed undici version honors them (it documents headersTimeout/bodyTimeout).
    const {statusCode, headers, body} = await pool.request({
      path: '/api/v1/token',
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'User-Agent': 'jambonz'
      },
      body: JSON.stringify(payload),
      timeout: HTTP_TIMEOUT,
      followRedirects: false
    });

    if (200 !== statusCode) {
      logger.debug({statusCode, headers, body: await body.text()}, 'error fetching access token from Verbio');
      const err = new Error(`error fetching access token from Verbio, status code ${statusCode}`);
      err.statusCode = statusCode;
      throw err;
    }
    const json = await body.json();
    const expiry = Math.floor(json.expiration_time - Date.now() / 1000 - 30);
    // Redis rejects SET ... EX with a non-positive or NaN TTL. A token that is about to
    // expire (or lacks expiration_time) is still usable right now, so return it uncached
    // rather than failing the whole call.
    if (expiry > 0) {
      await client.set(key, json.access_token, 'EX', expiry);
    }
    return {...json, servedFromCache: false};
  } catch (err) {
    debug(err, `getVerbioAccessToken: Error retrieving Verbio access token for client_id ${client_id}`);
    logger.error(err, `getVerbioAccessToken: Error retrieving Verbio access token for client_id ${client_id}`);
    throw err;
  }
}

module.exports = getVerbioAccessToken;
54 changes: 52 additions & 2 deletions lib/synth-audio.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const {
noopLogger
} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const getVerbioAccessToken = require('./get-verbio-token');
const {
SynthesisRequest,
Voice,
Expand Down Expand Up @@ -86,7 +87,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
logger = logger || noopLogger;

assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
'whisper', 'deepgram', 'playht', 'rimelabs'].includes(vendor) ||
'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio'].includes(vendor) ||
vendor.startsWith('custom'),
`synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
if ('google' === vendor) {
Expand Down Expand Up @@ -139,6 +140,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
assert.ok(credentials.api_key, 'synthAudio requires api_key when whisper is used');
} else if (vendor.startsWith('custom')) {
assert.ok(credentials.custom_tts_url, `synthAudio requires custom_tts_url in credentials when ${vendor} is used`);
} else if ('verbio' === vendor) {
assert.ok(voice, 'synthAudio requires voice when verbio is used');
assert.ok(credentials.client_id, 'synthAudio requires client_id when verbio is used');
assert.ok(credentials.client_secret, 'synthAudio requires client_secret when verbio is used');
}
const key = makeSynthKey({
account_sid,
Expand All @@ -149,7 +154,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
text
});
let filePath;
if (['nuance', 'nvidia'].includes(vendor) ||
if (['nuance', 'nvidia', 'verbio'].includes(vendor) ||
(
(process.env.JAMBONES_TTS_TRIM_SILENCE || !process.env.JAMBONES_DISABLE_TTS_STREAMING) &&
['microsoft', 'azure'].includes(vendor)
Expand Down Expand Up @@ -234,6 +239,11 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
if (audioBuffer?.filePath) return audioBuffer;
break;
case 'verbio':
audioBuffer = await synthVerbio(client, logger, {
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
if (audioBuffer?.filePath) return audioBuffer;
break;
case 'deepgram':
audioBuffer = await synthDeepgram(logger, {credentials, stats, model, text,
renderForCaching, disableTtsStreaming});
Expand Down Expand Up @@ -819,6 +829,46 @@ const synthRimelabs = async(logger, {
throw err;
}
};
/**
 * Synthesize speech with Verbio.
 *
 * When streaming is permitted (JAMBONES_DISABLE_TTS_STREAMING unset, and neither
 * renderForCaching nor disableTtsStreaming set) no audio is fetched here; a `say:`
 * directive string carrying the access token, vendor and voice is returned instead.
 * Otherwise the text is posted to /api/v1/synthesize and the resulting buffer returned.
 *
 * @param {object} client - redis client, forwarded to getVerbioAccessToken
 * @param {object} logger - logger exposing info()
 * @param {object} opts - credentials, stats, voice, text, renderForCaching, disableTtsStreaming
 * @returns {Promise<object|Buffer>} `{filePath, servedFromCache, rtt}` when streaming,
 *   otherwise the audio buffer
 * @throws {Error} when text exceeds 2000 characters, or on token/synthesis failure
 */
const synthVerbio = async(client, logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
  // https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
  if (text.length > 2000) {
    throw new Error('Verbio cannot synthesize for the text length larger than 2000 characters');
  }

  const auth = await getVerbioAccessToken(client, logger, credentials);
  const mustRenderNow = process.env.JAMBONES_DISABLE_TTS_STREAMING || renderForCaching || disableTtsStreaming;

  if (!mustRenderNow) {
    // Streaming: hand back a say: directive instead of audio.
    const directives = [
      `access_token=${auth.access_token}`,
      'vendor=verbio',
      `voice=${voice}`,
      'write_cache_file=1'
    ].join(',');
    const flattened = text.replace(/\n/g, ' ');

    return {
      filePath: `say:{${directives}}${flattened}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const requestHeaders = {
      'Authorization': `Bearer ${auth.access_token}`,
      'User-Agent': 'jambonz',
      'Content-Type': 'application/json'
    };
    const synthesize = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'buffer', requestHeaders);
    return await synthesize('/api/v1/synthesize', {
      voice_id: voice,
      output_sample_rate: '8k',
      output_encoding: 'pcm16',
      text
    });
  } catch (err) {
    logger.info({err}, 'synth Verbio returned error');
    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
    throw err;
  }
};

const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
const {api_key, model_id, baseURL, timeout, speed} = credentials;
Expand Down
9 changes: 8 additions & 1 deletion lib/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ const {SynthesizerClient} = require('../stubs/nuance/synthesizer_grpc_pb');
const {RivaSpeechSynthesisClient} = require('../stubs/riva/proto/riva_tts_grpc_pb');
const {Pool} = require('undici');
const pool = new Pool('https://auth.crt.nuance.com');
const HTTP_TIMEOUT = 5000;
const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443';
const grpc = require('@grpc/grpc-js');
const formurlencoded = require('form-urlencoded');
const { HTTP_TIMEOUT } = require('./constants');

const debug = require('debug')('jambonz:realtimedb-helpers');
/**
Expand Down Expand Up @@ -49,6 +49,12 @@ function makeAwsKey(awsAccessKeyId) {
return `aws:${hash.digest('hex')}`;
}

/**
 * Build the redis key under which a Verbio access token is cached.
 *
 * @param {string} client_id - Verbio client id
 * @returns {string} `verbio:` followed by the hex SHA-1 digest of client_id
 */
function makeVerbioKey(client_id) {
  const digest = crypto.createHash('sha1').update(client_id).digest('hex');
  return `verbio:${digest}`;
}

function makeNuanceKey(clientId, secret, scope) {
const hash = crypto.createHash('sha1');
hash.update(`${clientId}:${secret}:${scope}`);
Expand Down Expand Up @@ -117,6 +123,7 @@ module.exports = {
makeNuanceKey,
makeIbmKey,
makeAwsKey,
makeVerbioKey,
getNuanceAccessToken,
createNuanceClient,
createKryptonClient,
Expand Down
26 changes: 26 additions & 0 deletions test/list-voices.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,32 @@ const stats = {
histogram: () => {}
};

// Integration check against the live Verbio service: fetch an access token, then list voices.
// Passes trivially (skipped) unless VERBIO_CLIENT_ID and VERBIO_CLIENT_SECRET are set.
test('Verbio - get Access key and voices', async(t) => {
  const {client, getTtsVoices, getVerbioAccessToken} = require('..')(opts, logger);
  const {VERBIO_CLIENT_ID: client_id, VERBIO_CLIENT_SECRET: client_secret} = process.env;

  if (!(client_id && client_secret)) {
    t.pass('skipping Verbio test since no Verbio Keys provided');
    t.end();
    client.quit();
    return;
  }

  const credentials = {client_id, client_secret};
  try {
    const tokenInfo = await getVerbioAccessToken(credentials);
    t.ok(tokenInfo.access_token, 'successfully received access token not from cache');

    const voiceList = await getTtsVoices({vendor: 'verbio', credentials});
    t.ok(voiceList && voiceList.length != 0, 'successfully received verbio voices');
  } catch (err) {
    console.error(err);
    t.end(err);
  }
  client.quit();
});

test('IBM - create access key', async(t) => {
const fn = require('..');
const {client, getIbmAccessToken} = fn(opts, logger);
Expand Down
34 changes: 34 additions & 0 deletions test/synth.js
Original file line number Diff line number Diff line change
Expand Up @@ -670,6 +670,40 @@ test('whisper speech synth tests', async(t) => {
language: 'en-US',
voice: 'alloy',
text,
renderForCaching: true
});
t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`);

} catch (err) {
console.error(JSON.stringify(err));
t.end(err);
}
client.quit();
});

test('Verbio speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);

if (!process.env.VERBIO_CLIENT_ID || !process.env.VERBIO_CLIENT_SECRET) {
t.pass('skipping Verbio Synthesize test since no Verbio Keys provided');
t.end();
client.quit();
return;
}

const text = 'Hi there and welcome to jambones!';
try {
let opts = await synthAudio(stats, {
vendor: 'verbio',
credentials: {
client_id: process.env.VERBIO_CLIENT_ID,
client_secret: process.env.VERBIO_CLIENT_SECRET
},
language: 'en-US',
voice: 'tommy_en-us',
text,
renderForCaching: true
});
t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`);

Expand Down

0 comments on commit e60a2d2

Please sign in to comment.