Merge pull request #97 from jambonz/feat/google_voice_cloning

support google voice cloning
jambonz · Oct 31, 2024 · 63efecf · 63efecf
2 parents f183852 + 153ac3f
commit 63efecf
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 18 deletions.
diff --git a/lib/synth-audio.js b/lib/synth-audio.js
@@ -170,7 +170,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
     renderForCaching
   });
   let filePath;
-  filePath = makeFilePath({vendor, key, salt, renderForCaching});
+  filePath = makeFilePath({vendor, voice, key, salt, renderForCaching});
   debug(`synth key is ${key}`);
   let cached;
   if (!disableTtsCache) {
@@ -192,7 +192,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
       cached = await client.get(preCachekey);
       if (cached) {
         // Precache audio is available update filpath with precache file extension.
-        filePath = makeFilePath({vendor, key, salt, renderForCaching: true});
+        filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true});
       }
     }
   }
@@ -353,6 +353,44 @@ const synthPolly = async(createHash, retrieveHash, logger,
 
 const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
   const client = new ttsGoogle.TextToSpeechClient(credentials);
+  // Handle Google custom voice cloning when a voice cloning key is supplied.
+  // As of 31 Oct 2024, the Google Node SDK does not support voice cloning yet, so the REST API is called directly.
+  if (typeof voice === 'object' && voice.voice_cloning_key) {
+    try {
+      const accessToken = await client.auth.getAccessToken();
+      const projectId = await client.getProjectId();
+
+      const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
+        'Authorization': `Bearer ${accessToken}`,
+        'x-goog-user-project': projectId,
+        'Content-Type': 'application/json; charset=utf-8'
+      });
+
+      const payload = {
+        input: {
+          text
+        },
+        voice: {
+          language_code: language,
+          voice_clone: {
+            voice_cloning_key: voice.voice_cloning_key
+          }
+        },
+        audioConfig: {
+          // Voice cloning is still in the v1beta1 API; it supports LINEAR16 in WAV format at 24,000 Hz
+          audioEncoding: 'LINEAR16',
+          sample_rate_hertz: 24000
+        }
+      };
+
+      const wav = await post('/v1beta1/text:synthesize', payload);
+      return Buffer.from(wav.audioContent, 'base64');
+    } catch (err) {
+      logger.info({err: await err.text()}, 'synthGoogle returned error');
+      throw err;
+    }
+  }
+
   const opts = {
     voice: {
       ...(typeof voice === 'string' && {name: voice}),

diff --git a/lib/utils.js b/lib/utils.js
@@ -23,19 +23,20 @@ function makeSynthKey({
   hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
   const hexHashKey = hash.digest('hex');
   const accountKey = account_sid ? `:${account_sid}` : '';
-  const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, renderForCaching});
+  const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching});
   const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
   return key;
 }
 
-function makeFilePath({vendor, key, salt = '', renderForCaching = false}) {
-  const extension = getFileExtension({vendor, renderForCaching});
+function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) {
+  const extension = getFileExtension({vendor, renderForCaching, voice});
   return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`;
 }
 
-function getFileExtension({vendor, renderForCaching = false}) {
+function getFileExtension({vendor, voice, renderForCaching = false}) {
   const mp3Extension = 'mp3';
   const r8Extension = 'r8';
+  const wavExtension = 'wav';
 
   switch (vendor) {
     case 'azure':
@@ -58,6 +59,13 @@ function getFileExtension({vendor, renderForCaching = false}) {
     case 'nvidia':
     case 'verbio':
       return r8Extension;
+    case 'google':
+      // Google voice cloning only supports WAV.
+      if (typeof voice === 'object' && voice.voice_cloning_key) {
+        return wavExtension;
+      } else {
+        return mp3Extension;
+      }
     default:
       // If vendor is custom
       if (vendor.startsWith('custom')) {

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -28,7 +28,7 @@
   "dependencies": {
     "@aws-sdk/client-polly": "^3.496.0",
     "@aws-sdk/client-sts": "^3.496.0",
-    "@google-cloud/text-to-speech": "^5.0.2",
+    "@google-cloud/text-to-speech": "^5.5.0",
     "@grpc/grpc-js": "^1.9.14",
     "@jambonz/realtimedb-helpers": "^0.8.7",
     "bent": "^7.3.12",

diff --git a/test/synth.js b/test/synth.js
@@ -91,14 +91,17 @@ test('Google speech Custom voice synth tests', async(t) => {
   const fn = require('..');
   const {synthAudio, client} = fn(opts, logger);
 
-  if (!process.env.GCP_CUSTOM_VOICE_FILE && !process.env.GCP_CUSTOM_VOICE_JSON_KEY || !process.env.GCP_CUSTOM_VOICE_MODEL) {
-    t.pass('skipping google speech synth tests since neither GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided');
+  if (!process.env.GCP_CUSTOM_VOICE_FILE &&
+    !process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
+    !process.env.GCP_CUSTOM_VOICE_MODEL) {
+    t.pass(`skipping google speech synth tests since neither 
+GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided`);
     return t.end();
   }
   try {
     const str = process.env.GCP_CUSTOM_VOICE_JSON_KEY || fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE);
     const creds = JSON.parse(str);
-    let opts = await synthAudio(stats, {
+    const opts = await synthAudio(stats, {
       vendor: 'google',
       credentials: {
         credentials: {
@@ -109,7 +112,7 @@ test('Google speech Custom voice synth tests', async(t) => {
       language: 'en-AU',
       text: 'This is a test.  This is only a test',
       voice: {
-        reportedUsage:"REALTIME",
+        reportedUsage: 'REALTIME',
         model: process.env.GCP_CUSTOM_VOICE_MODEL
       }
     });
@@ -121,6 +124,48 @@ test('Google speech Custom voice synth tests', async(t) => {
   client.quit();
 });
 
+test('Google speech voice cloning synth tests', async(t) => {
+  const fn = require('..');
+  const {synthAudio, client} = fn(opts, logger);
+
+  if (!process.env.GCP_CUSTOM_VOICE_FILE &&
+    !process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
+    !process.env.GCP_VOICE_CLONING_FILE &&
+    !process.env.GCP_VOICE_CLONING_JSON_KEY) {
+    t.pass(`skipping google speech synth tests since neither 
+GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided,
+GCP_VOICE_CLONING_FILE nor GCP_VOICE_CLONING_JSON_KEY is not provided`);
+    return t.end();
+  }
+  try {
+    const googleKey = process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
+      fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE);
+    const voice_cloning_key = process.env.GCP_VOICE_CLONING_JSON_KEY ||
+      fs.readFileSync(process.env.GCP_VOICE_CLONING_FILE).toString();
+    const creds = JSON.parse(googleKey);
+    const opts = await synthAudio(stats, {
+      vendor: 'google',
+      credentials: {
+        credentials: {
+          client_email: creds.client_email,
+          private_key: creds.private_key,
+          project_id: creds.project_id
+        },
+      },
+      language: 'en-US',
+      text: 'This is a test. This is only a test. This is a test. This is only a test. This is a test. This is only a test',
+      voice: {
+        voice_cloning_key
+      }
+    });
+    t.ok(!opts.servedFromCache, `successfully synthesized google voice cloning audio to ${opts.filePath}`);
+  } catch (err) {
+    console.error(err);
+    t.end(err);
+  }
+  client.quit();
+});
+
 test('AWS speech synth tests', async(t) => {
   const fn = require('..');
   const {synthAudio, client} = fn(opts, logger);