add synthesize verbio

jambonz · May 20, 2024 · 2212be3 · 2212be3
1 parent 5d2d921
commit 2212be3
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 5 deletions.
diff --git a/lib/synth-audio.js b/lib/synth-audio.js
@@ -22,6 +22,7 @@ const {
   noopLogger
 } = require('./utils');
 const getNuanceAccessToken = require('./get-nuance-access-token');
+const getVerbioAccessToken = require('./get-verbio-token');
 const {
   SynthesisRequest,
   Voice,
@@ -86,7 +87,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
   logger = logger || noopLogger;
 
   assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
-    'whisper', 'deepgram', 'playht', 'rimelabs'].includes(vendor) ||
+    'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio'].includes(vendor) ||
   vendor.startsWith('custom'),
   `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
   if ('google' === vendor) {
@@ -139,6 +140,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
     assert.ok(credentials.api_key, 'synthAudio requires api_key when whisper is used');
   } else  if (vendor.startsWith('custom')) {
     assert.ok(credentials.custom_tts_url, `synthAudio requires custom_tts_url in credentials when ${vendor} is used`);
+  } else if ('verbio' === vendor) {
+    assert.ok(voice, 'synthAudio requires voice when verbio is used');
+    assert.ok(credentials.client_id, 'synthAudio requires client_id when verbio is used');
+    assert.ok(credentials.client_secret, 'synthAudio requires client_secret when verbio is used');
   }
   const key = makeSynthKey({
     account_sid,
@@ -149,7 +154,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
     text
   });
   let filePath;
-  if (['nuance', 'nvidia'].includes(vendor) ||
+  if (['nuance', 'nvidia', 'verbio'].includes(vendor) ||
     (
       (process.env.JAMBONES_TTS_TRIM_SILENCE || !process.env.JAMBONES_DISABLE_TTS_STREAMING) &&
       ['microsoft', 'azure'].includes(vendor)
@@ -234,6 +239,11 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
           credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
         if (audioBuffer?.filePath) return audioBuffer;
         break;
+      case 'verbio':
+        audioBuffer = await synthVerbio(client, logger, {
+          credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
+        if (audioBuffer?.filePath) return audioBuffer;
+        break;
       case 'deepgram':
         audioBuffer = await synthDeepgram(logger, {credentials, stats, model, text,
           renderForCaching, disableTtsStreaming});
@@ -819,6 +829,46 @@ const synthRimelabs = async(logger, {
     throw err;
   }
 };
+const synthVerbio = async(client, logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
+  /* Verbio text-to-speech.
+   * API reference: https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
+   *
+   * Throws when text is longer than 2000 characters. Otherwise obtains a token via
+   * getVerbioAccessToken(client, logger, credentials) and then takes one of two paths:
+   *
+   *  - Streaming: when JAMBONES_DISABLE_TTS_STREAMING is unset and neither
+   *    renderForCaching nor disableTtsStreaming is set, no HTTP request is made here.
+   *    Returns {filePath, servedFromCache: false, rtt: 0}, where filePath is a
+   *    "say:" string made of a {key=value,...} parameter list (access token, vendor,
+   *    voice, write_cache_file) followed by the text with newlines replaced by spaces.
+   *
+   *  - REST: POSTs voice_id, output_sample_rate '8k', output_encoding 'pcm16' and the
+   *    text to /api/v1/synthesize, and returns the response as delivered by bent in
+   *    'buffer' mode.
+   *
+   * An error thrown by the REST request is logged, counted in stats as
+   * tts.count with tags vendor:verbio / accepted:no, and rethrown. The token
+   * fetch sits outside that try block, so its errors propagate without the counter.
+   */
+  if (text.length > 2000) {
+    throw new Error('Verbio cannot synthesize for the text length larger than 2000 characters');
+  }
+  const token = await getVerbioAccessToken(client, logger, credentials);
+  if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
+    let params = '';
+    params += `{access_token=${token.access_token}`;
+    params += ',vendor=verbio';
+    params += `,voice=${voice}`;
+    params += ',write_cache_file=1';
+    params += '}';
+
+    return {
+      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
+      servedFromCache: false,
+      rtt: 0
+    };
+  }
+
+  try {
+    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'buffer', {
+      'Authorization': `Bearer ${token.access_token}`,
+      'User-Agent': 'jambonz',
+      'Content-Type': 'application/json'
+    });
+    const r8 = await post('/api/v1/synthesize', {
+      voice_id: voice,
+      output_sample_rate: '8k',
+      output_encoding: 'pcm16',
+      text
+    });
+    return r8;
+  } catch (err) {
+    logger.info({err}, 'synth Verbio returned error');
+    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
+    throw err;
+  }
+};
 
 const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
   const {api_key, model_id, baseURL, timeout, speed} = credentials;

diff --git a/test/list-voices.js b/test/list-voices.js
@@ -28,9 +28,7 @@ test('Verbio - get Access key and voices', async(t) => {
       client_secret: process.env.VERBIO_CLIENT_SECRET
     };
     let obj = await getVerbioAccessToken(credentials);
-    t.ok(obj.access_token && !obj.servedFromCache, 'successfully received access token not from cache');
-    obj = await getVerbioAccessToken(credentials);
-    t.ok(obj.access_token && obj.servedFromCache, 'successfully received access token from cache');
+    t.ok(obj.access_token , 'successfully received access token not from cache');
     const voices = await getTtsVoices({vendor: 'verbio', credentials});
     t.ok(voices && voices.length != 0, 'successfully received verbio voices');
   } catch (err) {

diff --git a/test/synth.js b/test/synth.js
@@ -670,6 +670,40 @@ test('whisper speech synth tests', async(t) => {
       language: 'en-US',
       voice: 'alloy',
       text,
+      renderForCaching: true
+    });
+    t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`);
+
+  } catch (err) {
+    console.error(JSON.stringify(err));
+    t.end(err);
+  }
+  client.quit();
+});
+
+test('Verbio speech synth tests', async(t) => {
+  const fn = require('..');
+  const {synthAudio, client} = fn(opts, logger);
+
+  if (!process.env.VERBIO_CLIENT_ID || !process.env.VERBIO_CLIENT_SECRET) {
+    t.pass('skipping Verbio Synthesize test since no Verbio Keys provided');
+    t.end();
+    client.quit();
+    return;
+  }
+
+  const text = 'Hi there and welcome to jambones!';
+  try {
+    let opts = await synthAudio(stats, {
+      vendor: 'verbio',
+      credentials: {
+        client_id: process.env.VERBIO_CLIENT_ID,
+        client_secret: process.env.VERBIO_CLIENT_SECRET
+      },
+      language: 'en-US',
+      voice: 'tommy_en-us',
+      text,
+      renderForCaching: true
     });
     t.ok(!opts.servedFromCache, `successfully synthesized whisper audio to ${opts.filePath}`);