Skip to content

Commit

Permalink
Merge pull request #65 from jambonz/feat/mod_playht_tts
Browse files Browse the repository at this point in the history
support mod_playht_tts
  • Loading branch information
davehorton authored Apr 8, 2024
2 parents 8650328 + 8382491 commit 12a3659
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 1 deletion.
68 changes: 67 additions & 1 deletion lib/synth-audio.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ async function synthAudio(client, logger, stats, { account_sid,
logger = logger || noopLogger;

assert.ok(['google', 'aws', 'polly', 'microsoft',
'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs', 'whisper', 'deepgram'].includes(vendor) ||
'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs', 'whisper', 'deepgram', 'playht'].includes(vendor) ||
vendor.startsWith('custom'),
`synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid, not ${vendor}`);
if ('google' === vendor) {
Expand Down Expand Up @@ -123,6 +123,11 @@ async function synthAudio(client, logger, stats, { account_sid,
assert.ok(voice, 'synthAudio requires voice when elevenlabs is used');
assert.ok(credentials.api_key, 'synthAudio requires api_key when elevenlabs is used');
assert.ok(credentials.model_id, 'synthAudio requires model_id when elevenlabs is used');
} else if ('playht' === vendor) {
assert.ok(voice, 'synthAudio requires voice when playht is used');
assert.ok(credentials.api_key, 'synthAudio requires api_key when playht is used');
assert.ok(credentials.user_id, 'synthAudio requires user_id when playht is used');
assert.ok(credentials.voice_engine, 'synthAudio requires voice_engine when playht is used');
} else if ('whisper' === vendor) {
assert.ok(voice, 'synthAudio requires voice when whisper is used');
assert.ok(credentials.model_id, 'synthAudio requires model when whisper is used');
Expand Down Expand Up @@ -206,6 +211,12 @@ async function synthAudio(client, logger, stats, { account_sid,
});
if (audioBuffer?.filePath) return audioBuffer;
break;
case 'playht':
audioBuffer = await synthPlayHT(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
if (audioBuffer?.filePath) return audioBuffer;
break;
case 'whisper':
audioBuffer = await synthWhisper(logger, {
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
Expand Down Expand Up @@ -682,6 +693,61 @@ const synthElevenlabs = async(logger, {
}
};

/**
 * Synthesize speech using PlayHT.
 *
 * Two modes:
 *  - streaming (default): no HTTP request is made here; a `say:{...}text` string
 *    carrying the credentials and tuning options is returned as `filePath`.
 *  - REST: when streaming is disabled (env var JAMBONES_DISABLE_TTS_STREAMING,
 *    `disableTtsStreaming`) or `renderForCaching` is set, the audio is fetched
 *    from https://api.play.ht/api/v2/tts/stream and returned as an mp3 buffer.
 *
 * @param {object} logger - logger exposing `info`
 * @param {object} params
 * @param {object} params.credentials - `api_key`, `user_id`, `voice_engine` and an
 *   optional JSON-encoded `options` string used when `params.options` is empty
 * @param {object} [params.options] - per-request tuning options; when non-empty
 *   they replace (not merge with) the options stored in the credentials
 * @param {object} params.stats - stats collector exposing `increment`
 * @param {string} params.voice - PlayHT voice identifier
 * @param {string} params.text - text to synthesize
 * @param {boolean} [params.renderForCaching] - force the REST path
 * @param {boolean} [params.disableTtsStreaming] - force the REST path
 * @returns {Promise<Buffer|{filePath: string, servedFromCache: boolean, rtt: number}>}
 * @throws rethrows any error raised by the REST request (after logging and
 *   incrementing the `tts.count` failure stat); JSON.parse errors propagate if
 *   `credentials.options` is not valid JSON
 */
const synthPlayHT = async(logger, {
  credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming
}) => {
  const {api_key, user_id, voice_engine, options: credOpts} = credentials;
  const hasRequestOptions = !!options && Object.keys(options).length !== 0;
  const opts = hasRequestOptions ? options : JSON.parse(credOpts || '{}');

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    /*
     * Each tuning option is forwarded under its own name. Previously `seed` was
     * emitted under the key `style` (PlayHT has no such parameter), and values of
     * 0 were silently dropped even though the REST path below passes them through.
     * NOTE(review): assumes the streaming module reads `seed` - confirm against mod_playht_tts.
     */
    const tuningKeys = [
      'quality', 'speed', 'seed', 'temperature', 'emotion',
      'voice_guidance', 'style_guidance', 'text_guidance'
    ];
    const tuning = tuningKeys
      .filter((key) => ![undefined, null, ''].includes(opts[key]))
      .map((key) => `,${key}=${opts[key]}`)
      .join('');

    const params = [
      `{api_key=${api_key}`,
      `,user_id=${user_id}`,
      ',vendor=playht',
      `,voice=${voice}`,
      `,voice_engine=${voice_engine}`,
      ',write_cache_file=1',
      tuning,
      '}'
    ].join('');

    return {
      /* line breaks are replaced with spaces so the say: string stays on one line */
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const post = bent('https://api.play.ht', 'POST', 'buffer', {
      'AUTHORIZATION': api_key,
      'X-USER-ID': user_id,
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    /* opts is spread last, so caller-supplied options can override the defaults above it */
    const mp3 = await post('/api/v2/tts/stream', {
      text,
      voice,
      voice_engine,
      output_format: 'mp3',
      sample_rate: 8000,
      ...opts
    });
    return mp3;
  } catch (err) {
    logger.info({err}, 'synth PlayHT returned error');
    stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
    throw err;
  }
};

const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
const {api_key, model_id, baseURL, timeout, speed} = credentials;
/* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
Expand Down
41 changes: 41 additions & 0 deletions test/synth.js
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,47 @@ test('Elevenlabs speech synth tests', async(t) => {
client.quit();
})

/*
 * Integration test for the PlayHT vendor: renders a short phrase with
 * renderForCaching set and checks the result was not served from cache.
 * Skipped unless PLAYHT_API_KEY and PLAYHT_USER_ID are present in the environment.
 */
test('PlayHT speech synth tests', async(t) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);

  if (!process.env.PLAYHT_API_KEY || !process.env.PLAYHT_USER_ID) {
    t.pass('skipping PlayHT speech synth tests since PLAYHT_API_KEY or PLAYHT_USER_ID is/are not provided');
    return t.end();
  }
  const text = 'Hi there and welcome to jambones!';
  try {
    /* named `result` so it does not shadow the module-level `opts` passed to fn() above */
    const result = await synthAudio(stats, {
      vendor: 'playht',
      credentials: {
        api_key: process.env.PLAYHT_API_KEY,
        user_id: process.env.PLAYHT_USER_ID,
        voice_engine: 'PlayHT2.0-turbo',
        options: JSON.stringify({
          quality: 'medium',
          speed: 1,
          seed: 1,
          temperature: 1,
          emotion: 'female_happy',
          voice_guidance: 3,
          style_guidance: 20,
          text_guidance: 1,
        })
      },
      language: 'en-US',
      voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
      text,
      renderForCaching: true
    });
    t.ok(!result.servedFromCache, `successfully synthesized PlayHT audio to ${result.filePath}`);

  } catch (err) {
    /* log the error object itself: JSON.stringify(err) yields '{}' for Error instances */
    console.error(err);
    t.end(err);
  }
  client.quit();
});

test('whisper speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
Expand Down

0 comments on commit 12a3659

Please sign in to comment.