From 2f410fc09f62e67c32ac6142e99937d3e8f29601 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Tue, 27 Aug 2024 16:21:02 +0800 Subject: [PATCH 1/7] feat: add tts stt --- app/client/api.ts | 22 ++ app/client/platforms/openai.ts | 83 ++++++- app/components/chat.tsx | 150 ++++++++++++- app/components/settings.tsx | 24 ++ app/components/stt-config.tsx | 51 +++++ app/components/stt.module.scss | 119 ++++++++++ app/components/tts-config.tsx | 132 +++++++++++ app/components/tts.module.scss | 119 ++++++++++ app/constant.ts | 20 ++ app/icons/speak-stop.svg | 1 + app/icons/speak.svg | 1 + app/icons/voice-white.svg | 16 ++ app/locales/cn.ts | 34 +++ app/locales/en.ts | 2 + app/locales/index.ts | 31 +++ app/store/access.ts | 9 + app/store/config.ts | 49 +++++ app/utils/audio.ts | 45 ++++ app/utils/ms_edge_tts.ts | 391 +++++++++++++++++++++++++++++++++ app/utils/speech.ts | 126 +++++++++++ package.json | 3 +- yarn.lock | 24 ++ 22 files changed, 1446 insertions(+), 6 deletions(-) create mode 100644 app/components/stt-config.tsx create mode 100644 app/components/stt.module.scss create mode 100644 app/components/tts-config.tsx create mode 100644 app/components/tts.module.scss create mode 100644 app/icons/speak-stop.svg create mode 100644 app/icons/speak.svg create mode 100644 app/icons/voice-white.svg create mode 100644 app/utils/audio.ts create mode 100644 app/utils/ms_edge_tts.ts create mode 100644 app/utils/speech.ts diff --git a/app/client/api.ts b/app/client/api.ts index d7fb023a226..8d0877a0d4d 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -20,6 +20,7 @@ export const ROLES = ["system", "user", "assistant"] as const; export type MessageRole = (typeof ROLES)[number]; export const Models = ["gpt-3.5-turbo", "gpt-4"] as const; +export const TTSModels = ["tts-1", "tts-1-hd"] as const; export type ChatModel = ModelType; export interface MultimodalContent { @@ -48,6 +49,25 @@ export interface LLMConfig { style?: DalleRequestPayload["style"]; } +export interface SpeechOptions { + model: string; + input: string; + voice: string; + response_format?: string; + speed?: number; + onController?: (controller: AbortController) => void; +} + +export interface TranscriptionOptions { + model?: "whisper-1"; + file: Blob; + language?: string; + prompt?: string; + response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt"; + temperature?: number; + onController?: (controller: AbortController) => void; +} + export interface ChatOptions { messages: RequestMessage[]; config: LLMConfig; @@ -80,6 +100,8 @@ export interface LLMModelProvider { export abstract class LLMApi { abstract chat(options: ChatOptions): Promise; + abstract speech(options: SpeechOptions): Promise; + abstract transcription(options: TranscriptionOptions): Promise; abstract usage(): Promise; abstract models(): Promise; } diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts index d4e262c16b4..02115140b72 100644 --- a/app/client/platforms/openai.ts +++ b/app/client/platforms/openai.ts @@ -26,6 +26,8 @@ import { LLMModel, LLMUsage, MultimodalContent, + SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -77,7 +79,7 @@ export interface DalleRequestPayload { export class ChatGPTApi implements LLMApi { private disableListModels = true; - path(path: string): string { + path(path: string, model?: string): string { const accessStore = useAccessStore.getState(); let baseUrl = ""; @@ -140,6 +142,85 @@ export class ChatGPTApi implements LLMApi { return 
res.choices?.at(0)?.message?.content ?? res; } + async speech(options: SpeechOptions): Promise { + const requestPayload = { + model: options.model, + input: options.input, + voice: options.voice, + response_format: options.response_format, + speed: options.speed, + }; + + console.log("[Request] openai speech payload: ", requestPayload); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const speechPath = this.path(OpenaiPath.SpeechPath, options.model); + const speechPayload = { + method: "POST", + body: JSON.stringify(requestPayload), + signal: controller.signal, + headers: getHeaders(), + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + + const res = await fetch(speechPath, speechPayload); + clearTimeout(requestTimeoutId); + return await res.arrayBuffer(); + } catch (e) { + console.log("[Request] failed to make a speech request", e); + throw e; + } + } + + async transcription(options: TranscriptionOptions): Promise { + const formData = new FormData(); + formData.append("file", options.file, "audio.wav"); + formData.append("model", options.model ?? "whisper-1"); + if (options.language) formData.append("language", options.language); + if (options.prompt) formData.append("prompt", options.prompt); + if (options.response_format) + formData.append("response_format", options.response_format); + if (options.temperature) + formData.append("temperature", options.temperature.toString()); + + console.log("[Request] openai audio transcriptions payload: ", options); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const path = this.path(OpenaiPath.TranscriptionPath, options.model); + const headers = getHeaders(true); + const payload = { + method: "POST", + body: formData, + signal: controller.signal, + headers: headers, + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + const res = await fetch(path, payload); + clearTimeout(requestTimeoutId); + const json = await res.json(); + return json.text; + } catch (e) { + console.log("[Request] failed to make a audio transcriptions request", e); + throw e; + } + } + async chat(options: ChatOptions) { const modelConfig = { ...useAppConfig.getState().modelConfig, diff --git a/app/components/chat.tsx b/app/components/chat.tsx index ed5b06799c3..e5391ad226c 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -10,11 +10,14 @@ import React, { } from "react"; import SendWhiteIcon from "../icons/send-white.svg"; +import VoiceWhiteIcon from "../icons/voice-white.svg"; import BrainIcon from "../icons/brain.svg"; import RenameIcon from "../icons/rename.svg"; import ExportIcon from "../icons/share.svg"; import ReturnIcon from "../icons/return.svg"; import CopyIcon from "../icons/copy.svg"; +import SpeakIcon from "../icons/speak.svg"; +import SpeakStopIcon from "../icons/speak-stop.svg"; import LoadingIcon from "../icons/three-dots.svg"; import LoadingButtonIcon from "../icons/loading.svg"; import PromptIcon from "../icons/prompt.svg"; @@ -64,6 +67,7 @@ import { getMessageImages, isVisionModel, isDalle3, + isFirefox, } from "../utils"; import { uploadImage as uploadImageRemote } from "@/app/utils/chat"; @@ -73,7 +77,7 @@ import dynamic from "next/dynamic"; import { ChatControllerPool } from "../client/controller"; import { DalleSize, DalleQuality, DalleStyle } from "../typing"; import { Prompt, usePromptStore } 
from "../store/prompt"; -import Locale from "../locales"; +import Locale, { getLang, getSTTLang } from "../locales"; import { IconButton } from "./button"; import styles from "./chat.module.scss"; @@ -90,6 +94,10 @@ import { import { useNavigate } from "react-router-dom"; import { CHAT_PAGE_SIZE, + DEFAULT_STT_ENGINE, + DEFAULT_TTS_ENGINE, + FIREFOX_DEFAULT_STT_ENGINE, + ModelProvider, LAST_INPUT_KEY, Path, REQUEST_TIMEOUT_MS, @@ -106,6 +114,16 @@ import { ExportMessageModal } from "./exporter"; import { getClientConfig } from "../config/client"; import { useAllModels } from "../utils/hooks"; import { MultimodalContent } from "../client/api"; +import { ClientApi } from "../client/api"; +import { createTTSPlayer } from "../utils/audio"; +import { + OpenAITranscriptionApi, + SpeechApi, + WebTranscriptionApi, +} from "../utils/speech"; +import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; + +const ttsPlayer = createTTSPlayer(); const Markdown = dynamic(async () => (await import("./markdown")).Markdown, { loading: () => , @@ -922,6 +940,33 @@ function _Chat() { } }; + const [isListening, setIsListening] = useState(false); + const [isTranscription, setIsTranscription] = useState(false); + const [speechApi, setSpeechApi] = useState(null); + + const startListening = async () => { + if (speechApi) { + await speechApi.start(); + setIsListening(true); + } + }; + + const stopListening = async () => { + if (speechApi) { + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(true); + await speechApi.stop(); + setIsListening(false); + } + }; + + const onRecognitionEnd = (finalTranscript: string) => { + console.log(finalTranscript); + if (finalTranscript) setUserInput(finalTranscript); + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(false); + }; + const doSubmit = (userInput: string) => { if (userInput.trim() === "") return; const matchCommand = chatCommands.match(userInput); @@ -992,6 +1037,16 @@ function _Chat() { } }); // eslint-disable-next-line react-hooks/exhaustive-deps + if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; + setSpeechApi( + config.sttConfig.engine === DEFAULT_STT_ENGINE + ? 
new WebTranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ) + : new OpenAITranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); }, []); // check if should send message @@ -1102,10 +1157,55 @@ function _Chat() { }); }; + const accessStore = useAccessStore(); + const [speechStatus, setSpeechStatus] = useState(false); + const [speechLoading, setSpeechLoading] = useState(false); + async function openaiSpeech(text: string) { + if (speechStatus) { + ttsPlayer.stop(); + setSpeechStatus(false); + } else { + var api: ClientApi; + api = new ClientApi(ModelProvider.GPT); + const config = useAppConfig.getState(); + setSpeechLoading(true); + ttsPlayer.init(); + let audioBuffer: ArrayBuffer; + const { markdownToTxt } = require("markdown-to-txt"); + const textContent = markdownToTxt(text); + if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) { + const edgeVoiceName = accessStore.edgeVoiceName(); + const tts = new MsEdgeTTS(); + await tts.setMetadata( + edgeVoiceName, + OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3, + ); + audioBuffer = await tts.toArrayBuffer(textContent); + } else { + audioBuffer = await api.llm.speech({ + model: config.ttsConfig.model, + input: textContent, + voice: config.ttsConfig.voice, + speed: config.ttsConfig.speed, + }); + } + setSpeechStatus(true); + ttsPlayer + .play(audioBuffer, () => { + setSpeechStatus(false); + }) + .catch((e) => { + console.error("[OpenAI Speech]", e); + showToast(prettyObject(e)); + setSpeechStatus(false); + }) + .finally(() => setSpeechLoading(false)); + } + } + const context: RenderMessage[] = useMemo(() => { return session.mask.hideContext ? [] : session.mask.context.slice(); }, [session.mask.context, session.mask.hideContext]); - const accessStore = useAccessStore(); if ( context.length === 0 && @@ -1567,6 +1667,26 @@ function _Chat() { ) } /> + {config.ttsConfig.enable && ( + + ) : ( + + ) + } + onClick={() => + openaiSpeech(getMessageTextContent(message)) + } + /> + )} )} @@ -1714,13 +1834,35 @@ function _Chat() { })} )} - } + text={ + isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak + } + className={styles["chat-input-send"]} + type="primary" + onClick={async () => + isListening ? 
await stopListening() : await startListening() + } + loding={isTranscription} + /> + ) : ( + } + text={Locale.Chat.Send} + className={styles["chat-input-send"]} + type="primary" + onClick={() => doSubmit(userInput)} + /> + )} + {/* } text={Locale.Chat.Send} className={styles["chat-input-send"]} type="primary" onClick={() => doSubmit(userInput)} - /> + /> */} diff --git a/app/components/settings.tsx b/app/components/settings.tsx index ca0a5a18796..47a72d79de7 100644 --- a/app/components/settings.tsx +++ b/app/components/settings.tsx @@ -80,6 +80,8 @@ import { useSyncStore } from "../store/sync"; import { nanoid } from "nanoid"; import { useMaskStore } from "../store/mask"; import { ProviderType } from "../utils/cloud"; +import { TTSConfigList } from "./tts-config"; +import { STTConfigList } from "./stt-config"; function EditPromptModal(props: { id: string; onClose: () => void }) { const promptStore = usePromptStore(); @@ -1646,6 +1648,28 @@ export function Settings() { setShowPromptModal(false)} /> )} + + { + const ttsConfig = { ...config.ttsConfig }; + updater(ttsConfig); + config.update((config) => (config.ttsConfig = ttsConfig)); + }} + /> + + + + { + const sttConfig = { ...config.sttConfig }; + updater(sttConfig); + config.update((config) => (config.sttConfig = sttConfig)); + }} + /> + + diff --git a/app/components/stt-config.tsx b/app/components/stt-config.tsx new file mode 100644 index 00000000000..f83d280305f --- /dev/null +++ b/app/components/stt-config.tsx @@ -0,0 +1,51 @@ +import { STTConfig, STTConfigValidator } from "../store"; + +import Locale from "../locales"; +import { ListItem, Select } from "./ui-lib"; +import { DEFAULT_STT_ENGINES } from "../constant"; +import { isFirefox } from "../utils"; + +export function STTConfigList(props: { + sttConfig: STTConfig; + updateConfig: (updater: (config: STTConfig) => void) => void; +}) { + return ( + <> + + + props.updateConfig( + (config) => (config.enable = e.currentTarget.checked), + ) + } + > + + {!isFirefox() && ( + + + + )} + + ); +} diff --git a/app/components/stt.module.scss b/app/components/stt.module.scss new file mode 100644 index 00000000000..ba9f382e40b --- /dev/null +++ b/app/components/stt.module.scss @@ -0,0 +1,119 @@ +@import "../styles/animation.scss"; +.plugin-page { + height: 100%; + display: flex; + flex-direction: column; + + .plugin-page-body { + padding: 20px; + overflow-y: auto; + + .plugin-filter { + width: 100%; + max-width: 100%; + margin-bottom: 20px; + animation: slide-in ease 0.3s; + height: 40px; + + display: flex; + + .search-bar { + flex-grow: 1; + max-width: 100%; + min-width: 0; + outline: none; + } + + .search-bar:focus { + border: 1px solid var(--primary); + } + + .plugin-filter-lang { + height: 100%; + margin-left: 10px; + } + + .plugin-create { + height: 100%; + margin-left: 10px; + box-sizing: border-box; + min-width: 80px; + } + } + + .plugin-item { + display: flex; + justify-content: space-between; + padding: 20px; + border: var(--border-in-light); + animation: slide-in ease 0.3s; + + &:not(:last-child) { + border-bottom: 0; + } + + &:first-child { + border-top-left-radius: 10px; + border-top-right-radius: 10px; + } + + &:last-child { + border-bottom-left-radius: 10px; + border-bottom-right-radius: 10px; + } + + .plugin-header { + display: flex; + align-items: center; + + .plugin-icon { + display: flex; + align-items: center; + justify-content: center; + margin-right: 10px; + } + + .plugin-title { + .plugin-name { + font-size: 14px; + font-weight: bold; + } + .plugin-info { + font-size: 12px; 
+ } + .plugin-runtime-warning { + font-size: 12px; + color: #f86c6c; + } + } + } + + .plugin-actions { + display: flex; + flex-wrap: nowrap; + transition: all ease 0.3s; + justify-content: center; + align-items: center; + } + + @media screen and (max-width: 600px) { + display: flex; + flex-direction: column; + padding-bottom: 10px; + border-radius: 10px; + margin-bottom: 20px; + box-shadow: var(--card-shadow); + + &:not(:last-child) { + border-bottom: var(--border-in-light); + } + + .plugin-actions { + width: 100%; + justify-content: space-between; + padding-top: 10px; + } + } + } + } +} diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx new file mode 100644 index 00000000000..f86e3bc520a --- /dev/null +++ b/app/components/tts-config.tsx @@ -0,0 +1,132 @@ +import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store"; + +import Locale from "../locales"; +import { ListItem, Select } from "./ui-lib"; +import { + DEFAULT_TTS_ENGINE, + DEFAULT_TTS_ENGINES, + DEFAULT_TTS_MODELS, + DEFAULT_TTS_VOICES, +} from "../constant"; +import { InputRange } from "./input-range"; + +export function TTSConfigList(props: { + ttsConfig: TTSConfig; + updateConfig: (updater: (config: TTSConfig) => void) => void; +}) { + return ( + <> + + + props.updateConfig( + (config) => (config.enable = e.currentTarget.checked), + ) + } + > + + {/* + + props.updateConfig( + (config) => (config.autoplay = e.currentTarget.checked), + ) + } + > + */} + + + + {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && ( + <> + + + + + + + + { + props.updateConfig( + (config) => + (config.speed = TTSConfigValidator.speed( + e.currentTarget.valueAsNumber, + )), + ); + }} + > + + + )} + + ); +} diff --git a/app/components/tts.module.scss b/app/components/tts.module.scss new file mode 100644 index 00000000000..ba9f382e40b --- /dev/null +++ b/app/components/tts.module.scss @@ -0,0 +1,119 @@ +@import "../styles/animation.scss"; +.plugin-page { + height: 100%; + display: flex; + flex-direction: column; + + .plugin-page-body { + padding: 20px; + overflow-y: auto; + + .plugin-filter { + width: 100%; + max-width: 100%; + margin-bottom: 20px; + animation: slide-in ease 0.3s; + height: 40px; + + display: flex; + + .search-bar { + flex-grow: 1; + max-width: 100%; + min-width: 0; + outline: none; + } + + .search-bar:focus { + border: 1px solid var(--primary); + } + + .plugin-filter-lang { + height: 100%; + margin-left: 10px; + } + + .plugin-create { + height: 100%; + margin-left: 10px; + box-sizing: border-box; + min-width: 80px; + } + } + + .plugin-item { + display: flex; + justify-content: space-between; + padding: 20px; + border: var(--border-in-light); + animation: slide-in ease 0.3s; + + &:not(:last-child) { + border-bottom: 0; + } + + &:first-child { + border-top-left-radius: 10px; + border-top-right-radius: 10px; + } + + &:last-child { + border-bottom-left-radius: 10px; + border-bottom-right-radius: 10px; + } + + .plugin-header { + display: flex; + align-items: center; + + .plugin-icon { + display: flex; + align-items: center; + justify-content: center; + margin-right: 10px; + } + + .plugin-title { + .plugin-name { + font-size: 14px; + font-weight: bold; + } + .plugin-info { + font-size: 12px; + } + .plugin-runtime-warning { + font-size: 12px; + color: #f86c6c; + } + } + } + + .plugin-actions { + display: flex; + flex-wrap: nowrap; + transition: all ease 0.3s; + justify-content: center; + align-items: center; + } + + @media screen and (max-width: 600px) { + display: flex; + flex-direction: column; + 
padding-bottom: 10px; + border-radius: 10px; + margin-bottom: 20px; + box-shadow: var(--card-shadow); + + &:not(:last-child) { + border-bottom: var(--border-in-light); + } + + .plugin-actions { + width: 100%; + justify-content: space-between; + padding-top: 10px; + } + } + } + } +} diff --git a/app/constant.ts b/app/constant.ts index e88d497ca94..ec0445d2e0e 100644 --- a/app/constant.ts +++ b/app/constant.ts @@ -153,6 +153,8 @@ export const Anthropic = { export const OpenaiPath = { ChatPath: "v1/chat/completions", + SpeechPath: "v1/audio/speech", + TranscriptionPath: "v1/audio/transcriptions", ImagePath: "v1/images/generations", UsagePath: "dashboard/billing/usage", SubsPath: "dashboard/billing/subscription", @@ -256,6 +258,24 @@ export const KnowledgeCutOffDate: Record = { "gemini-pro-vision": "2023-12", }; +export const DEFAULT_TTS_ENGINE = "OpenAI-TTS"; +export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"]; +export const DEFAULT_TTS_MODEL = "tts-1"; +export const DEFAULT_TTS_VOICE = "alloy"; +export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"]; +export const DEFAULT_TTS_VOICES = [ + "alloy", + "echo", + "fable", + "onyx", + "nova", + "shimmer", +]; + +export const DEFAULT_STT_ENGINE = "WebAPI"; +export const DEFAULT_STT_ENGINES = ["WebAPI", "OpenAI Whisper"]; +export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper"; + const openaiModels = [ "gpt-3.5-turbo", "gpt-3.5-turbo-1106", diff --git a/app/icons/speak-stop.svg b/app/icons/speak-stop.svg new file mode 100644 index 00000000000..926ae7bb3d6 --- /dev/null +++ b/app/icons/speak-stop.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/icons/speak.svg b/app/icons/speak.svg new file mode 100644 index 00000000000..e02212c9a42 --- /dev/null +++ b/app/icons/speak.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/icons/voice-white.svg b/app/icons/voice-white.svg new file mode 100644 index 00000000000..0a4a0ae31cd --- /dev/null +++ b/app/icons/voice-white.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/app/locales/cn.ts b/app/locales/cn.ts index 9a3227d68a5..c6aef51402f 100644 --- a/app/locales/cn.ts +++ b/app/locales/cn.ts @@ -43,6 +43,8 @@ const cn = { Delete: "删除", Edit: "编辑", FullScreen: "全屏", + Speech: "朗读", + StopSpeech: "停止", }, Commands: { new: "新建聊天", @@ -76,6 +78,8 @@ const cn = { return inputHints + ",/ 触发补全,: 触发命令"; }, Send: "发送", + StartSpeak: "说话", + StopSpeak: "停止", Config: { Reset: "清除记忆", SaveAs: "存为面具", @@ -481,6 +485,36 @@ const cn = { Title: "频率惩罚度 (frequency_penalty)", SubTitle: "值越大,越有可能降低重复字词", }, + TTS: { + Enable: { + Title: "启用文本转语音", + SubTitle: "启用文本生成语音服务", + }, + Autoplay: { + Title: "启用自动朗读", + SubTitle: "自动生成语音并播放,需先开启文本转语音开关", + }, + Model: "模型", + Engine: "转换引擎", + Voice: { + Title: "声音", + SubTitle: "生成语音时使用的声音", + }, + Speed: { + Title: "速度", + SubTitle: "生成语音的速度", + }, + }, + STT: { + Enable: { + Title: "启用语音转文本", + SubTitle: "启用语音转文本", + }, + Engine: { + Title: "转换引擎", + SubTitle: "音频转换引擎", + }, + }, }, Store: { DefaultTopic: "新的聊天", diff --git a/app/locales/en.ts b/app/locales/en.ts index 77f3a700ae1..1aa2137ec8d 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -45,6 +45,8 @@ const en: LocaleType = { Delete: "Delete", Edit: "Edit", FullScreen: "FullScreen", + Speech: "Play", + StopSpeech: "Stop", }, Commands: { new: "Start a new chat", diff --git a/app/locales/index.ts b/app/locales/index.ts index acdb3e878a1..3078afc7b54 100644 --- a/app/locales/index.ts +++ b/app/locales/index.ts @@ -137,3 +137,34 @@ export function getISOLang() { 
const lang = getLang(); return isoLangString[lang] ?? lang; } + +const DEFAULT_STT_LANG = "zh-CN"; +export const STT_LANG_MAP: Record = { + cn: "zh-CN", + en: "en-US", + pt: "pt-BR", + tw: "zh-TW", + jp: "ja-JP", + ko: "ko-KR", + id: "id-ID", + fr: "fr-FR", + es: "es-ES", + it: "it-IT", + tr: "tr-TR", + de: "de-DE", + vi: "vi-VN", + ru: "ru-RU", + cs: "cs-CZ", + no: "no-NO", + ar: "ar-SA", + bn: "bn-BD", + sk: "sk-SK", +}; + +export function getSTTLang(): string { + try { + return STT_LANG_MAP[getLang()]; + } catch { + return DEFAULT_STT_LANG; + } +} diff --git a/app/store/access.ts b/app/store/access.ts index a1014610e39..0e392e1e92e 100644 --- a/app/store/access.ts +++ b/app/store/access.ts @@ -120,6 +120,9 @@ const DEFAULT_ACCESS_STATE = { disableFastLink: false, customModels: "", defaultModel: "", + + // tts config + edgeTTSVoiceName: "zh-CN-YunxiNeural", }; export const useAccessStore = createPersistStore( @@ -132,6 +135,12 @@ export const useAccessStore = createPersistStore( return get().needCode; }, + edgeVoiceName() { + this.fetch(); + + return get().edgeTTSVoiceName; + }, + isValidOpenAI() { return ensure(get(), ["openaiApiKey"]); }, diff --git a/app/store/config.ts b/app/store/config.ts index e8e3c9863ef..e2de06c9ad8 100644 --- a/app/store/config.ts +++ b/app/store/config.ts @@ -5,12 +5,25 @@ import { DEFAULT_INPUT_TEMPLATE, DEFAULT_MODELS, DEFAULT_SIDEBAR_WIDTH, + DEFAULT_STT_ENGINE, + DEFAULT_STT_ENGINES, + DEFAULT_TTS_ENGINE, + DEFAULT_TTS_ENGINES, + DEFAULT_TTS_MODEL, + DEFAULT_TTS_MODELS, + DEFAULT_TTS_VOICE, + DEFAULT_TTS_VOICES, StoreKey, ServiceProvider, } from "../constant"; import { createPersistStore } from "../utils/store"; export type ModelType = (typeof DEFAULT_MODELS)[number]["name"]; +export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number]; +export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number]; +export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number]; + +export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number]; export enum SubmitKey { Enter = "Enter", @@ -66,11 +79,26 @@ export const DEFAULT_CONFIG = { quality: "standard" as DalleQuality, style: "vivid" as DalleStyle, }, + + ttsConfig: { + enable: false, + autoplay: false, + engine: DEFAULT_TTS_ENGINE, + model: DEFAULT_TTS_MODEL, + voice: DEFAULT_TTS_VOICE, + speed: 1.0, + }, + sttConfig: { + enable: false, + engine: DEFAULT_STT_ENGINE, + }, }; export type ChatConfig = typeof DEFAULT_CONFIG; export type ModelConfig = ChatConfig["modelConfig"]; +export type TTSConfig = ChatConfig["ttsConfig"]; +export type STTConfig = ChatConfig["sttConfig"]; export function limitNumber( x: number, @@ -85,6 +113,27 @@ export function limitNumber( return Math.min(max, Math.max(min, x)); } +export const TTSConfigValidator = { + engine(x: string) { + return x as TTSEngineType; + }, + model(x: string) { + return x as TTSModelType; + }, + voice(x: string) { + return x as TTSVoiceType; + }, + speed(x: number) { + return limitNumber(x, 0.25, 4.0, 1.0); + }, +}; + +export const STTConfigValidator = { + engine(x: string) { + return x as STTEngineType; + }, +}; + export const ModalConfigValidator = { model(x: string) { return x as ModelType; diff --git a/app/utils/audio.ts b/app/utils/audio.ts new file mode 100644 index 00000000000..f6828c7aac4 --- /dev/null +++ b/app/utils/audio.ts @@ -0,0 +1,45 @@ +type TTSPlayer = { + init: () => void; + play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise; + stop: () => void; +}; + +export function createTTSPlayer(): TTSPlayer { + let audioContext: 
AudioContext | null = null; + let audioBufferSourceNode: AudioBufferSourceNode | null = null; + + const init = () => { + audioContext = new (window.AudioContext || window.webkitAudioContext)(); + audioContext.suspend(); + }; + + const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => { + if (audioBufferSourceNode) { + audioBufferSourceNode.stop(); + audioBufferSourceNode.disconnect(); + } + + const buffer = await audioContext!.decodeAudioData(audioBuffer); + audioBufferSourceNode = audioContext!.createBufferSource(); + audioBufferSourceNode.buffer = buffer; + audioBufferSourceNode.connect(audioContext!.destination); + audioContext!.resume().then(() => { + audioBufferSourceNode!.start(); + }); + audioBufferSourceNode.onended = onended; + }; + + const stop = () => { + if (audioBufferSourceNode) { + audioBufferSourceNode.stop(); + audioBufferSourceNode.disconnect(); + audioBufferSourceNode = null; + } + if (audioContext) { + audioContext.close(); + audioContext = null; + } + }; + + return { init, play, stop }; +} diff --git a/app/utils/ms_edge_tts.ts b/app/utils/ms_edge_tts.ts new file mode 100644 index 00000000000..f291ebada93 --- /dev/null +++ b/app/utils/ms_edge_tts.ts @@ -0,0 +1,391 @@ +// import axios from "axios"; +import { Buffer } from "buffer"; +import { randomBytes } from "crypto"; +import { Readable } from "stream"; + +// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume + */ +export enum VOLUME { + SILENT = "silent", + X_SOFT = "x-soft", + SOFT = "soft", + MEDIUM = "medium", + LOUD = "loud", + X_LOUD = "x-LOUD", + DEFAULT = "default", +} + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking + */ +export enum RATE { + X_SLOW = "x-slow", + SLOW = "slow", + MEDIUM = "medium", + FAST = "fast", + X_FAST = "x-fast", + DEFAULT = "default", +} + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline + */ +export enum PITCH { + X_LOW = "x-low", + LOW = "low", + MEDIUM = "medium", + HIGH = "high", + X_HIGH = "x-high", + DEFAULT = "default", +} + +/** + * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted. 
+ */ +export enum OUTPUT_FORMAT { + // Streaming ============================= + // AMR_WB_16000HZ = "amr-wb-16000hz", + // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus", + // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3", + // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3", + // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3", + // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus", + // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus", + AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3", + AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3", + // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3", + // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3", + // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3", + // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus", + // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus", + // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus", + // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw", + // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw", + // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm", + // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm", + // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk", + // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm", + // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm", + // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk", + // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm", + // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm", + // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus", + // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus", + WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus", + // Non-streaming ============================= + // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw", + // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw", + // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm", + // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm", + // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm", + // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm", + // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm", +} + +export type Voice = { + Name: string; + ShortName: string; + Gender: string; + Locale: string; + SuggestedCodec: string; + FriendlyName: string; + Status: string; +}; + +export class ProsodyOptions { + /** + * The pitch to use. + * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%). + * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline) + */ + pitch?: PITCH | string = "+0Hz"; + /** + * The rate to use. + * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%). + * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking) + */ + rate?: RATE | string | number = 1.0; + /** + * The volume to use. + * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%). 
+ * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume) + */ + volume?: VOLUME | string | number = 100.0; +} + +export class MsEdgeTTS { + static OUTPUT_FORMAT = OUTPUT_FORMAT; + private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"; + private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; + private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; + private static BINARY_DELIM = "Path:audio\r\n"; + private static VOICE_LANG_REGEX = /\w{2}-\w{2}/; + private readonly _enableLogger; + private _ws: WebSocket | undefined; + private _voice: any; + private _voiceLocale: any; + private _outputFormat: any; + private _streams: { [key: string]: Readable } = {}; + private _startTime = 0; + + private _log(...o: any[]) { + if (this._enableLogger) { + console.log(...o); + } + } + + /** + * Create a new `MsEdgeTTS` instance. + * + * @param agent (optional, **NOT SUPPORTED IN BROWSER**) Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent). + * @param enableLogger=false whether to enable the built-in logger. This logs connections inits, disconnects, and incoming data to the console + */ + public constructor(enableLogger: boolean = false) { + this._enableLogger = enableLogger; + } + + private async _send(message: any) { + for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) { + if (i == 1) { + this._startTime = Date.now(); + } + this._log("connecting: ", i); + await this._initClient(); + } + this._ws!.send(message); + } + + private _initClient() { + this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL); + + this._ws.binaryType = "arraybuffer"; + return new Promise((resolve, reject) => { + this._ws!.onopen = () => { + this._log( + "Connected in", + (Date.now() - this._startTime) / 1000, + "seconds", + ); + this._send( + `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n + { + "context": { + "synthesis": { + "audio": { + "metadataoptions": { + "sentenceBoundaryEnabled": "false", + "wordBoundaryEnabled": "false" + }, + "outputFormat": "${this._outputFormat}" + } + } + } + } + `, + ).then(resolve); + }; + this._ws!.onmessage = (m: any) => { + const buffer = Buffer.from(m.data as ArrayBuffer); + const message = buffer.toString(); + const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1]; + if (message.includes("Path:turn.start")) { + // start of turn, ignore + } else if (message.includes("Path:turn.end")) { + // end of turn, close stream + this._streams[requestId].push(null); + } else if (message.includes("Path:response")) { + // context response, ignore + } else if ( + message.includes("Path:audio") && + m.data instanceof ArrayBuffer + ) { + this._pushAudioData(buffer, requestId); + } else { + this._log("UNKNOWN MESSAGE", message); + } + }; + this._ws!.onclose = () => { + this._log( + "disconnected after:", + (Date.now() - this._startTime) / 1000, + "seconds", + ); + for (const requestId in this._streams) { + this._streams[requestId].push(null); + } + }; + this._ws!.onerror = function (error: any) { + reject("Connect Error: " + error); + }; + }); + } + + private 
_pushAudioData(audioBuffer: Buffer, requestId: string) { + const audioStartIndex = + audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) + + MsEdgeTTS.BINARY_DELIM.length; + const audioData = audioBuffer.subarray(audioStartIndex); + this._streams[requestId].push(audioData); + this._log("received audio chunk, size: ", audioData?.length); + } + + private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string { + // in case future updates to the edge API block these elements, we'll be concatenating strings. + options = { ...new ProsodyOptions(), ...options }; + return ` + + + ${input} + + + `; + } + + /** + * Fetch the list of voices available in Microsoft Edge. + * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview). + */ + // getVoices(): Promise { + // return new Promise((resolve, reject) => { + // axios + // .get(MsEdgeTTS.VOICES_URL) + // .then((res) => resolve(res.data)) + // .catch(reject); + // }); + // } + getVoices(): Promise { + return fetch(MsEdgeTTS.VOICES_URL) + .then((response) => { + if (!response.ok) { + throw new Error("Network response was not ok"); + } + return response.json(); + }) + .then((data) => data as Voice[]) + .catch((error) => { + throw error; + }); + } + + /** + * Sets the required information for the speech to be synthesised and inits a new WebSocket connection. + * Must be called at least once before text can be synthesised. + * Saved in this instance. Can be called at any time times to update the metadata. + * + * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices) + * @param outputFormat any {@link OUTPUT_FORMAT} + * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName` + */ + async setMetadata( + voiceName: string, + outputFormat: OUTPUT_FORMAT, + voiceLocale?: string, + ) { + const oldVoice = this._voice; + const oldVoiceLocale = this._voiceLocale; + const oldOutputFormat = this._outputFormat; + + this._voice = voiceName; + this._voiceLocale = voiceLocale; + if (!this._voiceLocale) { + const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice); + if (!voiceLangMatch) + throw new Error("Could not infer voiceLocale from voiceName!"); + this._voiceLocale = voiceLangMatch[0]; + } + this._outputFormat = outputFormat; + + const changed = + oldVoice !== this._voice || + oldVoiceLocale !== this._voiceLocale || + oldOutputFormat !== this._outputFormat; + + // create new client + if (changed || this._ws!.readyState !== this._ws!.OPEN) { + this._startTime = Date.now(); + await this._initClient(); + } + } + + private _metadataCheck() { + if (!this._ws) + throw new Error( + "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.", + ); + } + + /** + * Close the WebSocket connection. + */ + close() { + this._ws!.close(); + } + + /** + * Writes raw audio synthesised from text in real-time to a {@link Readable}. 
Uses a basic {@link _SSMLTemplate SML template}. + * + * @param input the text to synthesise. Can include SSML elements. + * @param options (optional) {@link ProsodyOptions} + * @returns {Readable} - a `stream.Readable` with the audio data + */ + toStream(input: string, options?: ProsodyOptions): Readable { + const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options)); + return stream; + } + + toArrayBuffer(input: string, options?: ProsodyOptions): Promise { + return new Promise((resolve, reject) => { + let data: Uint8Array[] = []; + const readable = this.toStream(input, options); + readable.on("data", (chunk) => { + data.push(chunk); + }); + + readable.on("end", () => { + resolve(Buffer.concat(data).buffer); + }); + + readable.on("error", (err) => { + reject(err); + }); + }); + } + + /** + * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request. + * + * @param requestSSML the SSML to send. SSML elements required in order to work. + * @returns {Readable} - a `stream.Readable` with the audio data + */ + rawToStream(requestSSML: string): Readable { + const { stream } = this._rawSSMLRequest(requestSSML); + return stream; + } + + private _rawSSMLRequest(requestSSML: string): { + stream: Readable; + requestId: string; + } { + this._metadataCheck(); + + const requestId = randomBytes(16).toString("hex"); + const request = + `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n + ` + requestSSML.trim(); + // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup + const self = this; + const stream = new Readable({ + read() {}, + destroy(error: Error | null, callback: (error: Error | null) => void) { + delete self._streams[requestId]; + callback(error); + }, + }); + this._streams[requestId] = stream; + this._send(request).then(); + return { stream, requestId }; + } +} diff --git a/app/utils/speech.ts b/app/utils/speech.ts new file mode 100644 index 00000000000..dc8102879fb --- /dev/null +++ b/app/utils/speech.ts @@ -0,0 +1,126 @@ +import { ChatGPTApi } from "../client/platforms/openai"; +import { getSTTLang } from "../locales"; +import { isFirefox } from "../utils"; + +export type TranscriptionCallback = (transcription: string) => void; + +export abstract class SpeechApi { + protected onTranscription: TranscriptionCallback = () => {}; + + abstract isListening(): boolean; + abstract start(): Promise; + abstract stop(): Promise; + + onTranscriptionReceived(callback: TranscriptionCallback) { + this.onTranscription = callback; + } +} + +export class OpenAITranscriptionApi extends SpeechApi { + private listeningStatus = false; + private mediaRecorder: MediaRecorder | null = null; + private stream: MediaStream | null = null; + private audioChunks: Blob[] = []; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback) { + super(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + } + + async start(): Promise { + // @ts-ignore + navigator.getUserMedia = + // @ts-ignore + navigator.getUserMedia || + // @ts-ignore + navigator.webkitGetUserMedia || + // @ts-ignore + navigator.mozGetUserMedia || + // @ts-ignore + navigator.msGetUserMedia; + if (navigator.mediaDevices) { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + this.mediaRecorder = new MediaRecorder(stream); + this.mediaRecorder.ondataavailable = (e) => { 
+ if (e.data && e.data.size > 0) { + this.audioChunks.push(e.data); + } + }; + + this.stream = stream; + } else { + console.warn("Media Decives will work only with SSL"); + return; + } + + this.audioChunks = []; + + // this.recorder.addEventListener("dataavailable", (event) => { + // this.audioChunks.push(event.data); + // }); + + this.mediaRecorder.start(1000); + this.listeningStatus = true; + } + + async stop(): Promise { + if (!this.mediaRecorder || !this.listeningStatus) { + return; + } + + return new Promise((resolve) => { + this.mediaRecorder!.addEventListener("stop", async () => { + const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" }); + const llm = new ChatGPTApi(); + const transcription = await llm.transcription({ file: audioBlob }); + this.onTranscription(transcription); + this.listeningStatus = false; + resolve(); + }); + + this.mediaRecorder!.stop(); + }); + } +} + +export class WebTranscriptionApi extends SpeechApi { + private listeningStatus = false; + private recognitionInstance: any | null = null; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback) { + super(); + if (isFirefox()) return; + const SpeechRecognition = + (window as any).SpeechRecognition || + (window as any).webkitSpeechRecognition; + this.recognitionInstance = new SpeechRecognition(); + this.recognitionInstance.continuous = true; + this.recognitionInstance.interimResults = true; + this.recognitionInstance.lang = getSTTLang(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + this.recognitionInstance.onresult = (event: any) => { + const result = event.results[event.results.length - 1]; + if (result.isFinal) { + this.onTranscription(result[0].transcript); + } + }; + } + + async start(): Promise { + this.listeningStatus = true; + await this.recognitionInstance.start(); + } + + async stop(): Promise { + this.listeningStatus = false; + await this.recognitionInstance.stop(); + } +} diff --git a/package.json b/package.json index eb0a5ef6735..02d36ae3167 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "html-to-image": "^1.11.11", "lodash-es": "^4.17.21", "mermaid": "^10.6.1", + "markdown-to-txt": "^2.0.1", "nanoid": "^5.0.3", "next": "^14.1.1", "node-fetch": "^3.3.1", @@ -73,4 +74,4 @@ "lint-staged/yaml": "^2.2.2" }, "packageManager": "yarn@1.22.19" -} +} \ No newline at end of file diff --git a/yarn.lock b/yarn.lock index 793c845d722..3b76a49e780 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4378,11 +4378,21 @@ lodash.debounce@^4.0.8: resolved "https://registry.yarnpkg.com/lodash.debounce/-/lodash.debounce-4.0.8.tgz#82d79bff30a67c4005ffd5e2515300ad9ca4d7af" integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow== +lodash.escape@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98" + integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw== + lodash.merge@^4.6.2: version "4.6.2" resolved "https://registry.yarnpkg.com/lodash.merge/-/lodash.merge-4.6.2.tgz#558aa53b43b661e1925a0afdfa36a9a1085fe57a" integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ== +lodash.unescape@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/lodash.unescape/-/lodash.unescape-4.0.1.tgz#bf2249886ce514cda112fae9218cdc065211fc9c" + integrity 
sha512-DhhGRshNS1aX6s5YdBE3njCCouPgnG29ebyHvImlZzXZf2SHgt+J08DHgytTPnpywNbO1Y8mNUFyQuIDBq2JZg== + lodash@^4.17.21: version "4.17.21" resolved "https://registry.npmmirror.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c" @@ -4438,6 +4448,20 @@ markdown-table@^3.0.0: resolved "https://registry.yarnpkg.com/markdown-table/-/markdown-table-3.0.3.tgz#e6331d30e493127e031dd385488b5bd326e4a6bd" integrity sha512-Z1NL3Tb1M9wH4XESsCDEksWoKTdlUafKc4pt0GRwjUyXaCFZ+dc3g2erqB6zm3szA2IUSi7VnPI+o/9jnxh9hw== +markdown-to-txt@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/markdown-to-txt/-/markdown-to-txt-2.0.1.tgz#bfd6233a2635443cc24900a158b60c6af36ce9c5" + integrity sha512-Hsj7KTN8k1gutlLum3vosHwVZGnv8/cbYKWVkUyo/D1rzOYddbDesILebRfOsaVfjIBJank/AVOySBlHAYqfZw== + dependencies: + lodash.escape "^4.0.1" + lodash.unescape "^4.0.1" + marked "^4.0.14" + +marked@^4.0.14: + version "4.3.0" + resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3" + integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A== + mdast-util-definitions@^5.0.0: version "5.1.2" resolved "https://registry.yarnpkg.com/mdast-util-definitions/-/mdast-util-definitions-5.1.2.tgz#9910abb60ac5d7115d6819b57ae0bcef07a3f7a7" From 93f1762e6c85e2a71a70534dc8a84b322d3643e7 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Tue, 27 Aug 2024 17:02:44 +0800 Subject: [PATCH 2/7] chore: wip --- app/client/platforms/alibaba.ts | 7 +++++++ app/client/platforms/anthropic.ts | 7 +++++++ app/client/platforms/baidu.ts | 7 +++++++ app/client/platforms/bytedance.ts | 7 +++++++ app/client/platforms/google.ts | 6 ++++++ app/client/platforms/iflytek.ts | 7 +++++++ app/client/platforms/moonshot.ts | 7 +++++++ app/client/platforms/tencent.ts | 7 +++++++ 8 files changed, 55 insertions(+) diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index d5fa3042fc1..477ef193fdc 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -83,6 +83,13 @@ export class QwenApi implements LLMApi { return res?.output?.choices?.at(0)?.message?.content ?? 
""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts index b079ba1ada2..df4dc7f3830 100644 --- a/app/client/platforms/anthropic.ts +++ b/app/client/platforms/anthropic.ts @@ -73,6 +73,13 @@ const ClaudeMapper = { const keys = ["claude-2, claude-instant-1"]; export class ClaudeApi implements LLMApi { + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + extractMessage(res: any) { console.log("[Response] claude response: ", res); diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts index 3be147f4985..2b3119c2a2c 100644 --- a/app/client/platforms/baidu.ts +++ b/app/client/platforms/baidu.ts @@ -75,6 +75,13 @@ export class ErnieApi implements LLMApi { return [baseUrl, path].join("/"); } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ // "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function", diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts index 7677cafe12b..31c0be3d33b 100644 --- a/app/client/platforms/bytedance.ts +++ b/app/client/platforms/bytedance.ts @@ -77,6 +77,13 @@ export class DoubaoApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts index 12d8846357a..6c6c3b25e26 100644 --- a/app/client/platforms/google.ts +++ b/app/client/platforms/google.ts @@ -56,6 +56,12 @@ export class GeminiProApi implements LLMApi { "" ); } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } async chat(options: ChatOptions): Promise { const apiClient = this; let multimodal = false; diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts index 73cea5ba0e7..77a4571e124 100644 --- a/app/client/platforms/iflytek.ts +++ b/app/client/platforms/iflytek.ts @@ -53,6 +53,13 @@ export class SparkApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? 
""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts index 7d257ccb2e6..22bbaf01f46 100644 --- a/app/client/platforms/moonshot.ts +++ b/app/client/platforms/moonshot.ts @@ -66,6 +66,13 @@ export class MoonshotApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts index 579008a9b9d..5eb48791b01 100644 --- a/app/client/platforms/tencent.ts +++ b/app/client/platforms/tencent.ts @@ -89,6 +89,13 @@ export class HunyuanApi implements LLMApi { return res.Choices?.at(0)?.Message?.Content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); const messages = options.messages.map((v, index) => ({ From f86b220c922a9209e99e2a3647e97ab72f47de3d Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Tue, 27 Aug 2024 19:50:16 +0800 Subject: [PATCH 3/7] feat: add voice action --- app/components/chat.tsx | 113 ++++++++++++++++++-------------------- app/icons/voice-white.svg | 6 +- 2 files changed, 55 insertions(+), 64 deletions(-) diff --git a/app/components/chat.tsx b/app/components/chat.tsx index e5391ad226c..624b7618e21 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -453,6 +453,7 @@ export function ChatActions(props: { showPromptHints: () => void; hitBottom: boolean; uploading: boolean; + setUserInput: (input: string) => void; }) { const config = useAppConfig(); const navigate = useNavigate(); @@ -544,6 +545,44 @@ export function ChatActions(props: { } }, [chatStore, currentModel, models]); + const [isListening, setIsListening] = useState(false); + const [isTranscription, setIsTranscription] = useState(false); + const [speechApi, setSpeechApi] = useState(null); + + useEffect(() => { + if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; + setSpeechApi( + config.sttConfig.engine === DEFAULT_STT_ENGINE + ? new WebTranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ) + : new OpenAITranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); + }, []); + + const startListening = async () => { + if (speechApi) { + await speechApi.start(); + setIsListening(true); + } + }; + const stopListening = async () => { + if (speechApi) { + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(true); + await speechApi.stop(); + setIsListening(false); + } + }; + const onRecognitionEnd = (finalTranscript: string) => { + console.log(finalTranscript); + if (finalTranscript) props.setUserInput(finalTranscript); + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(false); + }; + return (
       {couldStop && (
@@ -768,6 +807,16 @@ export function ChatActions(props: {
           }}
         />
       )}
+
+      {config.sttConfig.enable && (
+        <ChatAction
+          onClick={async () =>
+            isListening ? await stopListening() : await startListening()
+          }
+          text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak}
+          icon={<VoiceWhiteIcon />}
+        />
+      )}
     </div>
); } @@ -940,33 +989,6 @@ function _Chat() { } }; - const [isListening, setIsListening] = useState(false); - const [isTranscription, setIsTranscription] = useState(false); - const [speechApi, setSpeechApi] = useState(null); - - const startListening = async () => { - if (speechApi) { - await speechApi.start(); - setIsListening(true); - } - }; - - const stopListening = async () => { - if (speechApi) { - if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) - setIsTranscription(true); - await speechApi.stop(); - setIsListening(false); - } - }; - - const onRecognitionEnd = (finalTranscript: string) => { - console.log(finalTranscript); - if (finalTranscript) setUserInput(finalTranscript); - if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) - setIsTranscription(false); - }; - const doSubmit = (userInput: string) => { if (userInput.trim() === "") return; const matchCommand = chatCommands.match(userInput); @@ -1037,16 +1059,6 @@ function _Chat() { } }); // eslint-disable-next-line react-hooks/exhaustive-deps - if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; - setSpeechApi( - config.sttConfig.engine === DEFAULT_STT_ENGINE - ? new WebTranscriptionApi((transcription) => - onRecognitionEnd(transcription), - ) - : new OpenAITranscriptionApi((transcription) => - onRecognitionEnd(transcription), - ), - ); }, []); // check if should send message @@ -1784,6 +1796,7 @@ function _Chat() { setUserInput("/"); onSearch(""); }} + setUserInput={setUserInput} /> diff --git a/app/icons/voice-white.svg b/app/icons/voice-white.svg index 0a4a0ae31cd..e7d5cbcc86f 100644 --- a/app/icons/voice-white.svg +++ b/app/icons/voice-white.svg @@ -1,4 +1,4 @@ - + @@ -7,9 +7,9 @@ - + - + From e9f90a4d82edbb446aedaef7ae27984d21b870d4 Mon Sep 17 00:00:00 2001 From: Meaqua Date: Tue, 27 Aug 2024 21:49:00 +0800 Subject: [PATCH 4/7] fix: i18n --- app/locales/en.ts | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/app/locales/en.ts b/app/locales/en.ts index 1aa2137ec8d..ae20a0d4f89 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -490,6 +490,37 @@ const en: LocaleType = { SubTitle: "A larger value decreasing the likelihood to repeat the same line", }, + TTS: { + Enable: { + Title: "Enable TTS", + SubTitle: "Enable text-to-speech service", + }, + Autoplay: { + Title: "Enable Autoplay", + SubTitle: + "Automatically generate speech and play, you need to enable the text-to-speech switch first", + }, + Model: "Model", + Voice: { + Title: "Voice", + SubTitle: "The voice to use when generating the audio", + }, + Speed: { + Title: "Speed", + SubTitle: "The speed of the generated audio", + }, + Engine: "TTS Engine", + }, + STT: { + Enable: { + Title: "Enable STT", + SubTitle: "Enable Speech-to-Text", + }, + Engine: { + Title: "STT Engine", + SubTitle: "Text-to-Speech Engine", + }, + }, }, Store: { DefaultTopic: "New Conversation", From ed5aea0521797841981919fa3c1ebb6340c35168 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Wed, 28 Aug 2024 12:37:19 +0800 Subject: [PATCH 5/7] fix: bug --- app/client/platforms/alibaba.ts | 2 ++ app/client/platforms/anthropic.ts | 9 ++++++++- app/client/platforms/baidu.ts | 2 ++ app/client/platforms/bytedance.ts | 2 ++ app/client/platforms/google.ts | 10 +++++++++- app/client/platforms/iflytek.ts | 9 ++++++++- app/client/platforms/moonshot.ts | 2 ++ app/client/platforms/tencent.ts | 2 ++ 8 files changed, 35 insertions(+), 3 deletions(-) diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index 477ef193fdc..e839c69f01f 100644 
From ed5aea0521797841981919fa3c1ebb6340c35168 Mon Sep 17 00:00:00 2001
From: DDMeaqua
Date: Wed, 28 Aug 2024 12:37:19 +0800
Subject: [PATCH 5/7] fix: bug

---
 app/client/platforms/alibaba.ts   |  2 ++
 app/client/platforms/anthropic.ts |  9 ++++++++-
 app/client/platforms/baidu.ts     |  2 ++
 app/client/platforms/bytedance.ts |  2 ++
 app/client/platforms/google.ts    | 10 +++++++++-
 app/client/platforms/iflytek.ts   |  9 ++++++++-
 app/client/platforms/moonshot.ts  |  2 ++
 app/client/platforms/tencent.ts   |  2 ++
 8 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
index 477ef193fdc..e839c69f01f 100644
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -12,6 +12,8 @@ import {
   getHeaders,
   LLMApi,
   LLMModel,
+  SpeechOptions,
+  TranscriptionOptions,
   MultimodalContent,
 } from "../api";
 import Locale from "../../locales";
diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts
index df4dc7f3830..f0f95f0fd98 100644
--- a/app/client/platforms/anthropic.ts
+++ b/app/client/platforms/anthropic.ts
@@ -1,5 +1,12 @@
 import { ACCESS_CODE_PREFIX, Anthropic, ApiPath } from "@/app/constant";
-import { ChatOptions, getHeaders, LLMApi, MultimodalContent } from "../api";
+import {
+  ChatOptions,
+  getHeaders,
+  LLMApi,
+  MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
+} from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import { getClientConfig } from "@/app/config/client";
 import { DEFAULT_API_HOST } from "@/app/constant";
diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts
index 2b3119c2a2c..0c2be5fb14b 100644
--- a/app/client/platforms/baidu.ts
+++ b/app/client/platforms/baidu.ts
@@ -14,6 +14,8 @@ import {
   LLMApi,
   LLMModel,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts
index 31c0be3d33b..5a0c9b8b12e 100644
--- a/app/client/platforms/bytedance.ts
+++ b/app/client/platforms/bytedance.ts
@@ -13,6 +13,8 @@ import {
   LLMApi,
   LLMModel,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts
index 6c6c3b25e26..c8d3658b350 100644
--- a/app/client/platforms/google.ts
+++ b/app/client/platforms/google.ts
@@ -1,5 +1,13 @@
 import { ApiPath, Google, REQUEST_TIMEOUT_MS } from "@/app/constant";
-import { ChatOptions, getHeaders, LLMApi, LLMModel, LLMUsage } from "../api";
+import {
+  ChatOptions,
+  getHeaders,
+  LLMApi,
+  LLMModel,
+  LLMUsage,
+  SpeechOptions,
+  TranscriptionOptions,
+} from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import { getClientConfig } from "@/app/config/client";
 import { DEFAULT_API_HOST } from "@/app/constant";
diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts
index 77a4571e124..6463e052e40 100644
--- a/app/client/platforms/iflytek.ts
+++ b/app/client/platforms/iflytek.ts
@@ -7,7 +7,14 @@ import {
 } from "@/app/constant";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 
-import { ChatOptions, getHeaders, LLMApi, LLMModel } from "../api";
+import {
+  ChatOptions,
+  getHeaders,
+  LLMApi,
+  LLMModel,
+  SpeechOptions,
+  TranscriptionOptions,
+} from "../api";
 import Locale from "../../locales";
 import {
   EventStreamContentType,
diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts
index 22bbaf01f46..b5a8aa5880d 100644
--- a/app/client/platforms/moonshot.ts
+++ b/app/client/platforms/moonshot.ts
@@ -20,6 +20,8 @@ import {
   LLMModel,
   LLMUsage,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts
index 5eb48791b01..1739b7a142b 100644
--- a/app/client/platforms/tencent.ts
+++ b/app/client/platforms/tencent.ts
@@ -8,6 +8,8 @@ import {
   LLMApi,
   LLMModel,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
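The hunks above only add the type imports; to satisfy the abstract speech and transcription members that LLMApi now declares, each of these platform clients presumably also carries stub implementations along the following lines (a sketch — return types are inferred from the OpenAI client, and the actual method bodies are not shown in this patch):

  // Hypothetical stubs for providers without native TTS/STT support,
  // placed inside the platform's LLMApi implementation class.
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }

  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }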
from "../api"; import Locale from "../../locales"; import { From 318e0989a2c28ae323d3f00d8256a7e48169e4a6 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Wed, 28 Aug 2024 13:13:41 +0800 Subject: [PATCH 6/7] fix: transcription headers --- app/client/api.ts | 13 ++++++++----- app/components/chat.tsx | 1 - app/components/tts-config.tsx | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/app/client/api.ts b/app/client/api.ts index 8d0877a0d4d..7e1d0135ed6 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -220,13 +220,16 @@ export function validString(x: string): boolean { return x?.length > 0; } -export function getHeaders() { +export function getHeaders(ignoreHeaders?: boolean) { const accessStore = useAccessStore.getState(); const chatStore = useChatStore.getState(); - const headers: Record = { - "Content-Type": "application/json", - Accept: "application/json", - }; + let headers: Record = {}; + if (!ignoreHeaders) { + headers = { + "Content-Type": "application/json", + Accept: "application/json", + }; + } const clientConfig = getClientConfig(); diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 624b7618e21..f4ebd70d88d 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -1686,7 +1686,6 @@ function _Chat() { ? Locale.Chat.Actions.StopSpeech : Locale.Chat.Actions.Speech } - loding={speechLoading} icon={ speechStatus ? ( diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx index f86e3bc520a..39ae85730c2 100644 --- a/app/components/tts-config.tsx +++ b/app/components/tts-config.tsx @@ -1,4 +1,4 @@ -import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store"; +import { TTSConfig, TTSConfigValidator } from "../store"; import Locale from "../locales"; import { ListItem, Select } from "./ui-lib"; @@ -111,6 +111,7 @@ export function TTSConfigList(props: { subTitle={Locale.Settings.TTS.Speed.SubTitle} > Date: Wed, 28 Aug 2024 13:15:52 +0800 Subject: [PATCH 7/7] fix: i18n --- app/locales/en.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/locales/en.ts b/app/locales/en.ts index ae20a0d4f89..dd13ff99cc5 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -80,6 +80,8 @@ const en: LocaleType = { return inputHints + ", / to search prompts, : to use commands"; }, Send: "Send", + StartSpeak: "Start Speak", + StopSpeak: "Stop Speak", Config: { Reset: "Reset to Default", SaveAs: "Save as Mask",