From 6a288c1db0c7f26eb2f6554a5573dfd2ba009397 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 8 Apr 2024 17:20:32 +0700 Subject: [PATCH 1/5] support mod_playht_tts --- lib/synth-audio.js | 55 ++++++++++++++++++++++++++++++++++++++++++++++ test/synth.js | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index edee6bd..53bb4f7 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -682,6 +682,61 @@ const synthElevenlabs = async(logger, { } }; +const synthPlayHT = async(logger, { + credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming +}) => { + const {api_key, user_id, voice_engine, options: credOpts} = credentials; + const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}'); + + /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ + if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { + let params = ''; + params += `{api_key=${api_key}`; + params += `,user_id=${user_id}`; + params += ',vendor=playht'; + params += `,voice=${voice}`; + params += `,voice_engine=${voice_engine}`; + params += ',write_cache_file=1'; + if (opts.quality) params += `,quality=${opts.quality}`; + if (opts.speed) params += `,speed=${opts.speed}`; + if (opts.seed) params += `,style=${opts.seed}`; + if (opts.temperature) params += `,temperature=${opts.temperature}`; + if (opts.emotion) params += `,emotion=${opts.emotion}`; + if (opts.voice_guidance) params += `,temperature=${opts.voice_guidance}`; + if (opts.style_guidance) params += `,temperature=${opts.style_guidance}`; + if (opts.text_guidance) params += `,temperature=${opts.text_guidance}`; + params += '}'; + + return { + filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`, + servedFromCache: false, + rtt: 0 + }; + } + + try { + const post = bent('https://api.play.ht', 'POST', 'buffer', { + 'AUTHORIZATION': api_key, + 'X-USER-ID': user_id, + 'Accept': 'audio/mpeg', + 'Content-Type': 'application/json' + }); + const mp3 = await post('/api/v2/tts/stream', { + text, + voice, + voice_engine, + output_format: 'mp3', + sample_rate: 8000, + ...opts + }); + return mp3; + } catch (err) { + logger.info({err}, 'synth PlayHT returned error'); + stats.increment('tts.count', ['vendor:playht', 'accepted:no']); + throw err; + } +}; + const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => { const {api_key, model_id, baseURL, timeout, speed} = credentials; /* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */ diff --git a/test/synth.js b/test/synth.js index 2fe922b..2fd6fa7 100644 --- a/test/synth.js +++ b/test/synth.js @@ -549,6 +549,47 @@ test('Elevenlabs speech synth tests', async(t) => { client.quit(); }) +test('PlayHT speech synth tests', async(t) => { + const fn = require('..'); + const {synthAudio, client} = fn(opts, logger); + + if (!process.env.PLAYHT_API_KEY || !process.env.PLAYHT_USER_ID) { + t.pass('skipping PlayHT speech synth tests since PLAYHT_API_KEY or PLAYHT_USER_ID is/are not provided'); + return t.end(); + } + const text = 'Hi there and welcome to jambones!'; + try { + let opts = await synthAudio(stats, { + vendor: 'playht', + credentials: { + api_key: process.env.PLAYHT_API_KEY, + user_id: process.env.PLAYHT_USER_ID, + voice_engine: 'PlayHT2.0-turbo', + options: JSON.stringify({ + quality: "medium", + speed: 1, + seed: 1, + temperature: 0, + emotion: "female_happy", + voice_guidance: 3, + style_guidance: 20, + text_guidance: 1, + }) + }, + language: 'en-US', + voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json', + text, + renderForCaching: true + }); + t.ok(!opts.servedFromCache, `successfully synthesized eleven audio to ${opts.filePath}`); + + } catch (err) { + console.error(JSON.stringify(err)); + t.end(err); + } + client.quit(); +}) + test('whisper speech synth tests', async(t) => { const fn = require('..'); const {synthAudio, client} = fn(opts, logger); From 282d87f922fd29de1885ebea82cfd3b959d419a5 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 8 Apr 2024 17:24:15 +0700 Subject: [PATCH 2/5] wip --- lib/synth-audio.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 53bb4f7..34358b7 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -123,6 +123,11 @@ async function synthAudio(client, logger, stats, { account_sid, assert.ok(voice, 'synthAudio requires voice when elevenlabs is used'); assert.ok(credentials.api_key, 'synthAudio requires api_key when elevenlabs is used'); assert.ok(credentials.model_id, 'synthAudio requires model_id when elevenlabs is used'); + } else if ('playht' === vendor) { + assert.ok(voice, 'synthAudio requires voice when playht is used'); + assert.ok(credentials.api_key, 'synthAudio requires api_key when playht is used'); + assert.ok(credentials.user_id, 'synthAudio requires user_id when playht is used'); + assert.ok(credentials.voice_engine, 'synthAudio requires voice_engine when playht is used'); } else if ('whisper' === vendor) { assert.ok(voice, 'synthAudio requires voice when whisper is used'); assert.ok(credentials.model_id, 'synthAudio requires model when whisper is used'); @@ -206,6 +211,12 @@ async function synthAudio(client, logger, stats, { account_sid, }); if (audioBuffer?.filePath) return audioBuffer; break; + case 'playht': + audioBuffer = await synthPlayHT(logger, { + credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath + }); + if (audioBuffer?.filePath) return audioBuffer; + break; case 'whisper': audioBuffer = await synthWhisper(logger, { credentials, stats, voice, text, renderForCaching, disableTtsStreaming}); From cc8963802fea8e0816767026a01fb88f3caa766a Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 8 Apr 2024 17:33:21 +0700 Subject: [PATCH 3/5] wip --- lib/synth-audio.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 34358b7..210aa89 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -85,7 +85,7 @@ async function synthAudio(client, logger, stats, { account_sid, logger = logger || noopLogger; assert.ok(['google', 'aws', 'polly', 'microsoft', - 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs', 'whisper', 'deepgram'].includes(vendor) || + 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs', 'whisper', 'deepgram', 'playht'].includes(vendor) || vendor.startsWith('custom'), `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid, not ${vendor}`); if ('google' === vendor) { From 545d559e278fc471737749168823a61639ac27ba Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 8 Apr 2024 19:14:58 +0700 Subject: [PATCH 4/5] wip --- test/synth.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/synth.js b/test/synth.js index 2fd6fa7..eea1c99 100644 --- a/test/synth.js +++ b/test/synth.js @@ -569,7 +569,7 @@ test('PlayHT speech synth tests', async(t) => { quality: "medium", speed: 1, seed: 1, - temperature: 0, + temperature: 1, emotion: "female_happy", voice_guidance: 3, style_guidance: 20, From 83824914777025e118681370e382660bfda83c31 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 8 Apr 2024 20:31:56 +0700 Subject: [PATCH 5/5] wip --- lib/synth-audio.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 210aa89..daade4c 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -713,9 +713,9 @@ const synthPlayHT = async(logger, { if (opts.seed) params += `,style=${opts.seed}`; if (opts.temperature) params += `,temperature=${opts.temperature}`; if (opts.emotion) params += `,emotion=${opts.emotion}`; - if (opts.voice_guidance) params += `,temperature=${opts.voice_guidance}`; - if (opts.style_guidance) params += `,temperature=${opts.style_guidance}`; - if (opts.text_guidance) params += `,temperature=${opts.text_guidance}`; + if (opts.voice_guidance) params += `,voice_guidance=${opts.voice_guidance}`; + if (opts.style_guidance) params += `,style_guidance=${opts.style_guidance}`; + if (opts.text_guidance) params += `,text_guidance=${opts.text_guidance}`; params += '}'; return {