Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support mod_playht_tts #65

Merged
merged 5 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 67 additions & 1 deletion lib/synth-audio.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ async function synthAudio(client, logger, stats, { account_sid,
logger = logger || noopLogger;

assert.ok(['google', 'aws', 'polly', 'microsoft',
'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs', 'whisper', 'deepgram'].includes(vendor) ||
'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs', 'whisper', 'deepgram', 'playht'].includes(vendor) ||
vendor.startsWith('custom'),
`synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid, not ${vendor}`);
if ('google' === vendor) {
Expand Down Expand Up @@ -123,6 +123,11 @@ async function synthAudio(client, logger, stats, { account_sid,
assert.ok(voice, 'synthAudio requires voice when elevenlabs is used');
assert.ok(credentials.api_key, 'synthAudio requires api_key when elevenlabs is used');
assert.ok(credentials.model_id, 'synthAudio requires model_id when elevenlabs is used');
} else if ('playht' === vendor) {
assert.ok(voice, 'synthAudio requires voice when playht is used');
assert.ok(credentials.api_key, 'synthAudio requires api_key when playht is used');
assert.ok(credentials.user_id, 'synthAudio requires user_id when playht is used');
assert.ok(credentials.voice_engine, 'synthAudio requires voice_engine when playht is used');
} else if ('whisper' === vendor) {
assert.ok(voice, 'synthAudio requires voice when whisper is used');
assert.ok(credentials.model_id, 'synthAudio requires model when whisper is used');
Expand Down Expand Up @@ -206,6 +211,12 @@ async function synthAudio(client, logger, stats, { account_sid,
});
if (audioBuffer?.filePath) return audioBuffer;
break;
case 'playht':
audioBuffer = await synthPlayHT(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
if (audioBuffer?.filePath) return audioBuffer;
break;
case 'whisper':
audioBuffer = await synthWhisper(logger, {
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
Expand Down Expand Up @@ -682,6 +693,61 @@ const synthElevenlabs = async(logger, {
}
};

/**
 * Synthesize speech using PlayHT.
 *
 * When streaming is allowed (env var JAMBONES_DISABLE_TTS_STREAMING is unset and
 * neither `renderForCaching` nor `disableTtsStreaming` is truthy) no HTTP request
 * is made here; instead a `say:{...}text` string carrying the credentials and
 * tuning options is returned in `filePath`.
 * Otherwise the text is POSTed to https://api.play.ht/api/v2/tts/stream and the
 * response buffer (requested as mp3 at 8000 Hz) is returned.
 *
 * @param {object} logger - logger exposing `info`
 * @param {object} args
 * @param {object} args.credentials - `api_key`, `user_id`, `voice_engine` and an
 *   optional `options` JSON string used when `args.options` is empty
 * @param {object} [args.options] - per-request options; used when it has at least one key
 * @param {object} args.stats - stats collector exposing `increment`
 * @param {string} args.voice - PlayHT voice identifier
 * @param {string} args.text - text to synthesize
 * @param {boolean} [args.renderForCaching] - force the non-streaming (HTTP) path
 * @param {boolean} [args.disableTtsStreaming] - force the non-streaming (HTTP) path
 * @returns {Promise<object|Buffer>} `{filePath, servedFromCache, rtt}` when streaming,
 *   otherwise the audio buffer returned by the HTTP request
 * @throws rethrows any error raised by the HTTP request (after logging and counting it),
 *   and a SyntaxError if `credentials.options` is not valid JSON
 */
const synthPlayHT = async(logger, {
  credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming
}) => {
  const {api_key, user_id, voice_engine, options: credOpts} = credentials;
  /* non-empty per-request options take precedence over the options stored with the credentials */
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    /* optional tuning knobs, emitted under their own name and in this fixed order when truthy */
    const optionalKeys = [
      'quality', 'speed', 'seed', 'temperature', 'emotion',
      'voice_guidance', 'style_guidance', 'text_guidance'
    ];
    const pairs = [
      `api_key=${api_key}`,
      `user_id=${user_id}`,
      'vendor=playht',
      `voice=${voice}`,
      `voice_engine=${voice_engine}`,
      'write_cache_file=1'
    ];
    for (const key of optionalKeys) {
      /* note: `seed` was previously sent under the key `style`, so the seed never reached PlayHT */
      if (opts[key]) pairs.push(`${key}=${opts[key]}`);
    }
    const params = `{${pairs.join(',')}}`;

    return {
      /* line breaks are flattened to spaces so the text stays on a single line */
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const post = bent('https://api.play.ht', 'POST', 'buffer', {
      'AUTHORIZATION': api_key,
      'X-USER-ID': user_id,
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    /* opts is spread last, so caller-supplied options can override the defaults above it */
    const mp3 = await post('/api/v2/tts/stream', {
      text,
      voice,
      voice_engine,
      output_format: 'mp3',
      sample_rate: 8000,
      ...opts
    });
    return mp3;
  } catch (err) {
    logger.info({err}, 'synth PlayHT returned error');
    stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
    throw err;
  }
};

const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
const {api_key, model_id, baseURL, timeout, speed} = credentials;
/* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
Expand Down
41 changes: 41 additions & 0 deletions test/synth.js
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,47 @@ test('Elevenlabs speech synth tests', async(t) => {
client.quit();
})

/*
 * Integration test for the PlayHT vendor: asks synthAudio to render audio with
 * renderForCaching set, and expects a result that was not served from cache.
 * Skipped (reported as a pass) unless both PLAYHT_API_KEY and PLAYHT_USER_ID
 * are present in the environment.
 */
test('PlayHT speech synth tests', async(t) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);

  if (!process.env.PLAYHT_API_KEY || !process.env.PLAYHT_USER_ID) {
    t.pass('skipping PlayHT speech synth tests since PLAYHT_API_KEY or PLAYHT_USER_ID is/are not provided');
    return t.end();
  }
  const text = 'Hi there and welcome to jambones!';
  try {
    /* named `result` so it does not shadow the outer `opts` handed to fn() above */
    const result = await synthAudio(stats, {
      vendor: 'playht',
      credentials: {
        api_key: process.env.PLAYHT_API_KEY,
        user_id: process.env.PLAYHT_USER_ID,
        voice_engine: 'PlayHT2.0-turbo',
        /* credential-level options are passed as a JSON string */
        options: JSON.stringify({
          quality: 'medium',
          speed: 1,
          seed: 1,
          temperature: 1,
          emotion: 'female_happy',
          voice_guidance: 3,
          style_guidance: 20,
          text_guidance: 1,
        })
      },
      language: 'en-US',
      voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
      text,
      renderForCaching: true
    });
    t.ok(!result.servedFromCache, `successfully synthesized PlayHT audio to ${result.filePath}`);

  } catch (err) {
    console.error(JSON.stringify(err));
    t.end(err);
  }
  client.quit();
});

test('whisper speech synth tests', async(t) => {
const fn = require('..');
const {synthAudio, client} = fn(opts, logger);
Expand Down
Loading