Skip to content

Commit

Permalink
Merge pull request #97 from jambonz/feat/google_voice_cloning
Browse files Browse the repository at this point in the history
support google voice cloning
  • Loading branch information
davehorton authored Oct 31, 2024
2 parents f183852 + 153ac3f commit 63efecf
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 18 deletions.
42 changes: 40 additions & 2 deletions lib/synth-audio.js
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
renderForCaching
});
let filePath;
filePath = makeFilePath({vendor, key, salt, renderForCaching});
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching});
debug(`synth key is ${key}`);
let cached;
if (!disableTtsCache) {
Expand All @@ -192,7 +192,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
cached = await client.get(preCachekey);
if (cached) {
// Precache audio is available; update filePath with the precache file extension.
filePath = makeFilePath({vendor, key, salt, renderForCaching: true});
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true});
}
}
}
Expand Down Expand Up @@ -353,6 +353,44 @@ const synthPolly = async(createHash, retrieveHash, logger,

const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
const client = new ttsGoogle.TextToSpeechClient(credentials);
// Handle Google custom voice cloning, if requested.
// As of 31 Oct 2024, the Google Node SDK does not yet support voice cloning.
if (typeof voice === 'object' && voice.voice_cloning_key) {
try {
const accessToken = await client.auth.getAccessToken();
const projectId = await client.getProjectId();

const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
'Authorization': `Bearer ${accessToken}`,
'x-goog-user-project': projectId,
'Content-Type': 'application/json; charset=utf-8'
});

const payload = {
input: {
text
},
voice: {
language_code: language,
voice_clone: {
voice_cloning_key: voice.voice_cloning_key
}
},
audioConfig: {
// Cloning voice at this time still in v1 beta version, and it support LINEAR16 in Wav format, 24.000Hz
audioEncoding: 'LINEAR16',
sample_rate_hertz: 24000
}
};

const wav = await post('/v1beta1/text:synthesize', payload);
return Buffer.from(wav.audioContent, 'base64');
} catch (err) {
logger.info({err: await err.text()}, 'synthGoogle returned error');
throw err;
}
}

const opts = {
voice: {
...(typeof voice === 'string' && {name: voice}),
Expand Down
16 changes: 12 additions & 4 deletions lib/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,20 @@ function makeSynthKey({
hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
const hexHashKey = hash.digest('hex');
const accountKey = account_sid ? `:${account_sid}` : '';
const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, renderForCaching});
const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching});
const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
return key;
}

function makeFilePath({vendor, key, salt = '', renderForCaching = false}) {
const extension = getFileExtension({vendor, renderForCaching});
/**
 * Build the path, under TMP_FOLDER, of the audio file for a synth key.
 *
 * The file extension is obtained from getFileExtension for the given vendor,
 * voice and renderForCaching flag. The first occurrence of 'tts:' in `key`
 * is replaced with `tts-` followed by `salt`.
 *
 * @param {object} params
 * @param {string} params.vendor - TTS vendor name
 * @param {string|object} params.voice - voice passed through to getFileExtension
 * @param {string} params.key - synth key
 * @param {string} [params.salt=''] - text inserted after the 'tts-' prefix
 * @param {boolean} [params.renderForCaching=false] - passed through to getFileExtension
 * @returns {string} the file path
 */
function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) {
  const ext = getFileExtension({vendor, renderForCaching, voice});
  const baseName = key.replace('tts:', `tts-${salt}`);
  return `${TMP_FOLDER}/${baseName}.${ext}`;
}

function getFileExtension({vendor, renderForCaching = false}) {
function getFileExtension({vendor, voice, renderForCaching = false}) {
const mp3Extension = 'mp3';
const r8Extension = 'r8';
const wavExtension = 'wav';

switch (vendor) {
case 'azure':
Expand All @@ -58,6 +59,13 @@ function getFileExtension({vendor, renderForCaching = false}) {
case 'nvidia':
case 'verbio':
return r8Extension;
case 'google':
// Google voice cloning only supports WAV output.
if (typeof voice === 'object' && voice.voice_cloning_key) {
return wavExtension;
} else {
return mp3Extension;
}
default:
// If vendor is custom
if (vendor.startsWith('custom')) {
Expand Down
15 changes: 8 additions & 7 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"dependencies": {
"@aws-sdk/client-polly": "^3.496.0",
"@aws-sdk/client-sts": "^3.496.0",
"@google-cloud/text-to-speech": "^5.0.2",
"@google-cloud/text-to-speech": "^5.5.0",
"@grpc/grpc-js": "^1.9.14",
"@jambonz/realtimedb-helpers": "^0.8.7",
"bent": "^7.3.12",
Expand Down
53 changes: 49 additions & 4 deletions test/synth.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,17 @@ test('Google speech Custom voice synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);

if (!process.env.GCP_CUSTOM_VOICE_FILE && !process.env.GCP_CUSTOM_VOICE_JSON_KEY || !process.env.GCP_CUSTOM_VOICE_MODEL) {
t.pass('skipping google speech synth tests since neither GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided');
if (!process.env.GCP_CUSTOM_VOICE_FILE &&
!process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
!process.env.GCP_CUSTOM_VOICE_MODEL) {
t.pass(`skipping google speech synth tests since neither
GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided`);
return t.end();
}
try {
const str = process.env.GCP_CUSTOM_VOICE_JSON_KEY || fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE);
const creds = JSON.parse(str);
let opts = await synthAudio(stats, {
const opts = await synthAudio(stats, {
vendor: 'google',
credentials: {
credentials: {
Expand All @@ -109,7 +112,7 @@ test('Google speech Custom voice synth tests', async(t) => {
language: 'en-AU',
text: 'This is a test. This is only a test',
voice: {
reportedUsage:"REALTIME",
reportedUsage: 'REALTIME',
model: process.env.GCP_CUSTOM_VOICE_MODEL
}
});
Expand All @@ -121,6 +124,48 @@ test('Google speech Custom voice synth tests', async(t) => {
client.quit();
});

// Exercises synthAudio with vendor 'google' and a voice object that carries a
// voice_cloning_key. The test is skipped unless both the Google credentials
// (GCP_CUSTOM_VOICE_FILE or GCP_CUSTOM_VOICE_JSON_KEY) and a voice cloning key
// (GCP_VOICE_CLONING_FILE or GCP_VOICE_CLONING_JSON_KEY) are provided via env vars.
test('Google speech voice cloning synth tests', async(t) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);

  const hasGoogleCreds = process.env.GCP_CUSTOM_VOICE_FILE || process.env.GCP_CUSTOM_VOICE_JSON_KEY;
  const hasCloningKey = process.env.GCP_VOICE_CLONING_FILE || process.env.GCP_VOICE_CLONING_JSON_KEY;
  if (!hasGoogleCreds || !hasCloningKey) {
    t.pass(`skipping google speech synth tests since neither
GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided,
GCP_VOICE_CLONING_FILE nor GCP_VOICE_CLONING_JSON_KEY is not provided`);
    // Quit the client on the skip path too, as the normal path does at the end
    // of the test; otherwise it is never quit when the test is skipped.
    client.quit();
    return t.end();
  }
  try {
    // Each value comes either inline from the *_JSON_KEY variable or from the file
    // named by the matching *_FILE variable.
    const googleKey = process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
      fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE);
    const voice_cloning_key = process.env.GCP_VOICE_CLONING_JSON_KEY ||
      fs.readFileSync(process.env.GCP_VOICE_CLONING_FILE).toString();
    const creds = JSON.parse(googleKey);
    // Named `result` rather than `opts` so it does not shadow the outer `opts`
    // passed to fn() above.
    const result = await synthAudio(stats, {
      vendor: 'google',
      credentials: {
        credentials: {
          client_email: creds.client_email,
          private_key: creds.private_key,
          project_id: creds.project_id
        },
      },
      language: 'en-US',
      text: 'This is a test. This is only a test. This is a test. This is only a test. This is a test. This is only a test',
      voice: {
        voice_cloning_key
      }
    });
    t.ok(!result.servedFromCache, `successfully synthesized google voice cloning audio to ${result.filePath}`);
  } catch (err) {
    console.error(err);
    t.end(err);
  }
  client.quit();
});

test('AWS speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
Expand Down

0 comments on commit 63efecf

Please sign in to comment.