diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
index 732b574c883..1b952913088 100644
--- a/packages/components/src/speechToText.ts
+++ b/packages/components/src/speechToText.ts
@@ -4,40 +4,69 @@ import { type ClientOptions, OpenAIClient } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'
 
+const SpeechToTextType = {
+    OPENAI_WHISPER: 'openAIWhisper',
+    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
+    LOCALAI_STT: 'localAISTT'
+}
+
 export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
     if (speechToTextConfig) {
         const credentialId = speechToTextConfig.credentialId as string
         const credentialData = await getCredentialData(credentialId ?? '', options)
         const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)
 
-        if (speechToTextConfig.name === 'openAIWhisper') {
-            const openAIClientOptions: ClientOptions = {
-                apiKey: credentialData.openAIApiKey
-            }
-            const openAIClient = new OpenAIClient(openAIClientOptions)
-            const transcription = await openAIClient.audio.transcriptions.create({
-                file: new File([new Blob([audio_file])], upload.name),
-                model: 'whisper-1',
-                language: speechToTextConfig?.language,
-                temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
-                prompt: speechToTextConfig?.prompt
-            })
-            if (transcription?.text) {
-                return transcription.text
+        switch (speechToTextConfig.name) {
+            case SpeechToTextType.OPENAI_WHISPER: {
+                const openAIClientOptions: ClientOptions = {
+                    apiKey: credentialData.openAIApiKey
+                }
+                const openAIClient = new OpenAIClient(openAIClientOptions)
+                const openAITranscription = await openAIClient.audio.transcriptions.create({
+                    file: new File([new Blob([audio_file])], upload.name),
+                    model: 'whisper-1',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    prompt: speechToTextConfig?.prompt
+                })
+                if (openAITranscription?.text) {
+                    return openAITranscription.text
+                }
+                break
             }
-        } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
-            const client = new AssemblyAI({
-                apiKey: credentialData.assemblyAIApiKey
-            })
+            case SpeechToTextType.ASSEMBLYAI_TRANSCRIBE: {
+                const assemblyAIClient = new AssemblyAI({
+                    apiKey: credentialData.assemblyAIApiKey
+                })
 
-            const params = {
-                audio: audio_file,
-                speaker_labels: false
-            }
+                const params = {
+                    audio: audio_file,
+                    speaker_labels: false
+                }
 
-            const transcription = await client.transcripts.transcribe(params)
-            if (transcription?.text) {
-                return transcription.text
+                const assemblyAITranscription = await assemblyAIClient.transcripts.transcribe(params)
+                if (assemblyAITranscription?.text) {
+                    return assemblyAITranscription.text
+                }
+                break
+            }
+            case SpeechToTextType.LOCALAI_STT: {
+                const localAIClientOptions: ClientOptions = {
+                    apiKey: credentialData.localAIApiKey,
+                    baseURL: speechToTextConfig?.baseUrl
+                }
+                const localAIClient = new OpenAIClient(localAIClientOptions)
+                const localAITranscription = await localAIClient.audio.transcriptions.create({
+                    file: new File([new Blob([audio_file])], upload.name),
+                    model: speechToTextConfig?.model || 'whisper-1',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    prompt: speechToTextConfig?.prompt
+                })
+                if (localAITranscription?.text) {
+                    return localAITranscription.text
+                }
+                break
             }
         }
     } else {
diff --git a/packages/server/src/utils/buildChatflow.ts b/packages/server/src/utils/buildChatflow.ts
index 3ffefff326e..df946f98b9d 100644
--- a/packages/server/src/utils/buildChatflow.ts
+++ b/packages/server/src/utils/buildChatflow.ts
@@ -78,7 +78,8 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
     }
 
     // Run Speech to Text conversion
-    if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') {
+    if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4' || upload.mime === 'audio/ogg') {
+        logger.debug(`Attempting a speech to text conversion...`)
         let speechToTextConfig: ICommonObject = {}
         if (chatflow.speechToText) {
             const speechToTextProviders = JSON.parse(chatflow.speechToText)
@@ -99,6 +100,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
                 databaseEntities: databaseEntities
             }
             const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
+            logger.debug(`Speech to text result: ${speechToTextResult}`)
             if (speechToTextResult) {
                 incomingInput.question = speechToTextResult
             }
diff --git a/packages/ui/src/assets/images/localai.png b/packages/ui/src/assets/images/localai.png
new file mode 100644
index 00000000000..321403973da
Binary files /dev/null and b/packages/ui/src/assets/images/localai.png differ
diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx
index 3c8fb674e0e..ee376ddc5e1 100644
--- a/packages/ui/src/ui-component/extended/SpeechToText.jsx
+++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx
@@ -16,6 +16,7 @@ import { StyledButton } from '@/ui-component/button/StyledButton'
 import { Dropdown } from '@/ui-component/dropdown/Dropdown'
 import openAISVG from '@/assets/images/openai.svg'
 import assemblyAIPng from '@/assets/images/assemblyai.png'
+import localAiPng from '@/assets/images/localai.png'
 
 // store
 import useNotifier from '@/utils/useNotifier'
@@ -23,10 +24,19 @@ import useNotifier from '@/utils/useNotifier'
 // API
 import chatflowsApi from '@/api/chatflows'
 
+// If implementing a new provider, this must be updated in
+// components/src/speechToText.ts as well
+const SpeechToTextType = {
+    OPENAI_WHISPER: 'openAIWhisper',
+    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
+    LOCALAI_STT: 'localAISTT'
+}
+
+// Weird quirk - the key must match the name property value.
 const speechToTextProviders = {
-    openAIWhisper: {
+    [SpeechToTextType.OPENAI_WHISPER]: {
         label: 'OpenAI Whisper',
-        name: 'openAIWhisper',
+        name: SpeechToTextType.OPENAI_WHISPER,
         icon: openAISVG,
         url: 'https://platform.openai.com/docs/guides/speech-to-text',
         inputs: [
@@ -63,9 +73,9 @@ const speechToTextProviders = {
             }
         ]
     },
-    assemblyAiTranscribe: {
+    [SpeechToTextType.ASSEMBLYAI_TRANSCRIBE]: {
         label: 'Assembly AI',
-        name: 'assemblyAiTranscribe',
+        name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
         icon: assemblyAIPng,
         url: 'https://www.assemblyai.com/',
         inputs: [
@@ -76,6 +86,59 @@ const speechToTextProviders = {
                 credentialNames: ['assemblyAIApi']
             }
         ]
+    },
+    [SpeechToTextType.LOCALAI_STT]: {
+        label: 'LocalAI STT',
+        name: SpeechToTextType.LOCALAI_STT,
+        icon: localAiPng,
+        url: 'https://localai.io/features/audio-to-text/',
+        inputs: [
+            {
+                label: 'Connect Credential',
+                name: 'credential',
+                type: 'credential',
+                credentialNames: ['localAIApi']
+            },
+            {
+                label: 'Base URL',
+                name: 'baseUrl',
+                type: 'string',
+                description: 'The base URL of the LocalAI server'
+            },
+            {
+                label: 'Language',
+                name: 'language',
+                type: 'string',
+                description:
+                    'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
+                placeholder: 'en',
+                optional: true
+            },
+            {
+                label: 'Model',
+                name: 'model',
+                type: 'string',
+                description: `The STT model to load. Defaults to whisper-1 if left blank.`,
+                placeholder: 'whisper-1',
+                optional: true
+            },
+            {
+                label: 'Prompt',
+                name: 'prompt',
+                type: 'string',
+                rows: 4,
+                description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
+                optional: true
+            },
+            {
+                label: 'Temperature',
+                name: 'temperature',
+                type: 'number',
+                step: 0.1,
+                description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
+                optional: true
+            }
+        ]
     }
 }
 
@@ -191,8 +254,11 @@ const SpeechToText = ({ dialogProps }) => {
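Reviewer note: a minimal sketch of how the new LocalAI branch of `convertSpeechToText` could be driven, assuming the config shape produced by the SpeechToText.jsx inputs above. The import path, credential id, URLs, and chat ids are placeholders, and the cast on `upload` skips the `IFileUpload` fields the function does not read; a real call also needs whatever `getCredentialData` expects on `options` (e.g. `appDataSource`, `databaseEntities`, as seen in buildChatflow.ts).

```ts
// Hypothetical harness, not part of this PR.
import { convertSpeechToText } from 'flowise-components'

const run = async () => {
    // Keys mirror the LocalAI inputs declared in SpeechToText.jsx; values are placeholders.
    const speechToTextConfig = {
        name: 'localAISTT', // i.e. SpeechToTextType.LOCALAI_STT
        credentialId: 'my-localai-credential', // resolved to { localAIApiKey } via getCredentialData
        baseUrl: 'http://localhost:8080/v1', // LocalAI's OpenAI-compatible endpoint
        model: 'whisper-1', // the branch falls back to 'whisper-1' when blank
        language: 'en',
        temperature: '0.2' // stored as a string; parseFloat'd inside convertSpeechToText
    }

    // chatflowid/chatId are needed so getFileFromStorage can locate the audio upload
    const options = { chatflowid: 'chatflow-id', chatId: 'chat-id' }

    // only `name` is read from the upload inside convertSpeechToText
    const upload = { name: 'voice-note.ogg', mime: 'audio/ogg' }

    const text = await convertSpeechToText(upload as any, speechToTextConfig, options)
    console.log(text)
}

run().catch(console.error)
```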
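One design note on the duplicated `SpeechToTextType` maps: the comment in SpeechToText.jsx asks future providers to update components/src/speechToText.ts by hand. If the UI build can import from the components package, a single exported constant would let the bundler enforce the sync instead. A sketch of the idea, not something this PR does:

```ts
// packages/components/src/speechToText.ts — export the map once (hypothetical refactor)
export const SpeechToTextType = {
    OPENAI_WHISPER: 'openAIWhisper',
    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
    LOCALAI_STT: 'localAISTT'
} as const

// The UI side could then reuse it instead of redeclaring:
// import { SpeechToTextType } from 'flowise-components'
```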