diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 310dd7b..3591071 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -170,7 +170,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc renderForCaching }); let filePath; - filePath = makeFilePath({vendor, key, salt, renderForCaching}); + filePath = makeFilePath({vendor, voice, key, salt, renderForCaching}); debug(`synth key is ${key}`); let cached; if (!disableTtsCache) { @@ -192,7 +192,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc cached = await client.get(preCachekey); if (cached) { // Precache audio is available update filpath with precache file extension. - filePath = makeFilePath({vendor, key, salt, renderForCaching: true}); + filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true}); } } } @@ -353,6 +353,44 @@ const synthPolly = async(createHash, retrieveHash, logger, const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => { const client = new ttsGoogle.TextToSpeechClient(credentials); + // If google custom voice cloning is used. + // At this time 31 Oct 2024, google node sdk has not support voice cloning yet. + if (typeof voice === 'object' && voice.voice_cloning_key) { + try { + const accessToken = await client.auth.getAccessToken(); + const projectId = await client.getProjectId(); + + const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', { + 'Authorization': `Bearer ${accessToken}`, + 'x-goog-user-project': projectId, + 'Content-Type': 'application/json; charset=utf-8' + }); + + const payload = { + input: { + text + }, + voice: { + language_code: language, + voice_clone: { + voice_cloning_key: voice.voice_cloning_key + } + }, + audioConfig: { + // Cloning voice at this time still in v1 beta version, and it support LINEAR16 in Wav format, 24.000Hz + audioEncoding: 'LINEAR16', + sample_rate_hertz: 24000 + } + }; + + const wav = await post('/v1beta1/text:synthesize', payload); + return Buffer.from(wav.audioContent, 'base64'); + } catch (err) { + logger.info({err: await err.text()}, 'synthGoogle returned error'); + throw err; + } + } + const opts = { voice: { ...(typeof voice === 'string' && {name: voice}), diff --git a/lib/utils.js b/lib/utils.js index 620f469..86b74d7 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -23,19 +23,20 @@ function makeSynthKey({ hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`); const hexHashKey = hash.digest('hex'); const accountKey = account_sid ? `:${account_sid}` : ''; - const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, renderForCaching}); + const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching}); const key = `tts${accountKey}:${namespace}:${hexHashKey}`; return key; } -function makeFilePath({vendor, key, salt = '', renderForCaching = false}) { - const extension = getFileExtension({vendor, renderForCaching}); +function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) { + const extension = getFileExtension({vendor, renderForCaching, voice}); return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`; } -function getFileExtension({vendor, renderForCaching = false}) { +function getFileExtension({vendor, voice, renderForCaching = false}) { const mp3Extension = 'mp3'; const r8Extension = 'r8'; + const wavExtension = 'wav'; switch (vendor) { case 'azure': @@ -58,6 +59,13 @@ function getFileExtension({vendor, renderForCaching = false}) { case 'nvidia': case 'verbio': return r8Extension; + case 'google': + // google voice cloning just support wav. + if (typeof voice === 'object' && voice.voice_cloning_key) { + return wavExtension; + } else { + return mp3Extension; + } default: // If vendor is custom if (vendor.startsWith('custom')) { diff --git a/package-lock.json b/package-lock.json index 942ae66..db3166f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,7 +11,7 @@ "dependencies": { "@aws-sdk/client-polly": "^3.496.0", "@aws-sdk/client-sts": "^3.496.0", - "@google-cloud/text-to-speech": "^5.0.2", + "@google-cloud/text-to-speech": "^5.5.0", "@grpc/grpc-js": "^1.9.14", "@jambonz/realtimedb-helpers": "^0.8.7", "bent": "^7.3.12", @@ -1230,9 +1230,10 @@ } }, "node_modules/@google-cloud/text-to-speech": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.0.2.tgz", - "integrity": "sha512-Q11Ddh9eHKSDA3E/KSqMITgVprXb0XgIKuJP9F5ScJ1T9h+DNrbgIU7shd0QOlPqb8ruQRiTOqL08+Mq5R89Ow==", + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.5.0.tgz", + "integrity": "sha512-Cw/UK2Y3l31Vsuozu8cxsmVS/09fShimes0tRLgDbOY2ZMG1Dckb6Zf/Q3Nxg4X0feFep44pvwNmyHKrOnl9SQ==", + "license": "Apache-2.0", "dependencies": { "google-gax": "^4.0.3" }, @@ -8338,9 +8339,9 @@ "dev": true }, "@google-cloud/text-to-speech": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.0.2.tgz", - "integrity": "sha512-Q11Ddh9eHKSDA3E/KSqMITgVprXb0XgIKuJP9F5ScJ1T9h+DNrbgIU7shd0QOlPqb8ruQRiTOqL08+Mq5R89Ow==", + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.5.0.tgz", + "integrity": "sha512-Cw/UK2Y3l31Vsuozu8cxsmVS/09fShimes0tRLgDbOY2ZMG1Dckb6Zf/Q3Nxg4X0feFep44pvwNmyHKrOnl9SQ==", "requires": { "google-gax": "^4.0.3" } diff --git a/package.json b/package.json index 2c634dd..a5be980 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,7 @@ "dependencies": { "@aws-sdk/client-polly": "^3.496.0", "@aws-sdk/client-sts": "^3.496.0", - "@google-cloud/text-to-speech": "^5.0.2", + "@google-cloud/text-to-speech": "^5.5.0", "@grpc/grpc-js": "^1.9.14", "@jambonz/realtimedb-helpers": "^0.8.7", "bent": "^7.3.12", diff --git a/test/synth.js b/test/synth.js index a160d2b..35f83b3 100644 --- a/test/synth.js +++ b/test/synth.js @@ -91,14 +91,17 @@ test('Google speech Custom voice synth tests', async(t) => { const fn = require('..'); const {synthAudio, client} = fn(opts, logger); - if (!process.env.GCP_CUSTOM_VOICE_FILE && !process.env.GCP_CUSTOM_VOICE_JSON_KEY || !process.env.GCP_CUSTOM_VOICE_MODEL) { - t.pass('skipping google speech synth tests since neither GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided'); + if (!process.env.GCP_CUSTOM_VOICE_FILE && + !process.env.GCP_CUSTOM_VOICE_JSON_KEY || + !process.env.GCP_CUSTOM_VOICE_MODEL) { + t.pass(`skipping google speech synth tests since neither +GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided`); return t.end(); } try { const str = process.env.GCP_CUSTOM_VOICE_JSON_KEY || fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE); const creds = JSON.parse(str); - let opts = await synthAudio(stats, { + const opts = await synthAudio(stats, { vendor: 'google', credentials: { credentials: { @@ -109,7 +112,7 @@ test('Google speech Custom voice synth tests', async(t) => { language: 'en-AU', text: 'This is a test. This is only a test', voice: { - reportedUsage:"REALTIME", + reportedUsage: 'REALTIME', model: process.env.GCP_CUSTOM_VOICE_MODEL } }); @@ -121,6 +124,48 @@ test('Google speech Custom voice synth tests', async(t) => { client.quit(); }); +test('Google speech voice cloning synth tests', async(t) => { + const fn = require('..'); + const {synthAudio, client} = fn(opts, logger); + + if (!process.env.GCP_CUSTOM_VOICE_FILE && + !process.env.GCP_CUSTOM_VOICE_JSON_KEY || + !process.env.GCP_VOICE_CLONING_FILE && + !process.env.GCP_VOICE_CLONING_JSON_KEY) { + t.pass(`skipping google speech synth tests since neither +GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, +GCP_VOICE_CLONING_FILE nor GCP_VOICE_CLONING_JSON_KEY is not provided`); + return t.end(); + } + try { + const googleKey = process.env.GCP_CUSTOM_VOICE_JSON_KEY || + fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE); + const voice_cloning_key = process.env.GCP_VOICE_CLONING_JSON_KEY || + fs.readFileSync(process.env.GCP_VOICE_CLONING_FILE).toString(); + const creds = JSON.parse(googleKey); + const opts = await synthAudio(stats, { + vendor: 'google', + credentials: { + credentials: { + client_email: creds.client_email, + private_key: creds.private_key, + project_id: creds.project_id + }, + }, + language: 'en-US', + text: 'This is a test. This is only a test. This is a test. This is only a test. This is a test. This is only a test', + voice: { + voice_cloning_key + } + }); + t.ok(!opts.servedFromCache, `successfully synthesized google voice cloning audio to ${opts.filePath}`); + } catch (err) { + console.error(err); + t.end(err); + } + client.quit(); +}); + test('AWS speech synth tests', async(t) => { const fn = require('..'); const {synthAudio, client} = fn(opts, logger);