From 2f410fc09f62e67c32ac6142e99937d3e8f29601 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Tue, 27 Aug 2024 16:21:02 +0800 Subject: [PATCH 1/7] feat: add tts stt --- app/client/api.ts | 22 ++ app/client/platforms/openai.ts | 83 ++++++- app/components/chat.tsx | 150 ++++++++++++- app/components/settings.tsx | 24 ++ app/components/stt-config.tsx | 51 +++++ app/components/stt.module.scss | 119 ++++++++++ app/components/tts-config.tsx | 132 +++++++++++ app/components/tts.module.scss | 119 ++++++++++ app/constant.ts | 20 ++ app/icons/speak-stop.svg | 1 + app/icons/speak.svg | 1 + app/icons/voice-white.svg | 16 ++ app/locales/cn.ts | 34 +++ app/locales/en.ts | 2 + app/locales/index.ts | 31 +++ app/store/access.ts | 9 + app/store/config.ts | 49 +++++ app/utils/audio.ts | 45 ++++ app/utils/ms_edge_tts.ts | 391 +++++++++++++++++++++++++++++++++ app/utils/speech.ts | 126 +++++++++++ package.json | 3 +- yarn.lock | 24 ++ 22 files changed, 1446 insertions(+), 6 deletions(-) create mode 100644 app/components/stt-config.tsx create mode 100644 app/components/stt.module.scss create mode 100644 app/components/tts-config.tsx create mode 100644 app/components/tts.module.scss create mode 100644 app/icons/speak-stop.svg create mode 100644 app/icons/speak.svg create mode 100644 app/icons/voice-white.svg create mode 100644 app/utils/audio.ts create mode 100644 app/utils/ms_edge_tts.ts create mode 100644 app/utils/speech.ts diff --git a/app/client/api.ts b/app/client/api.ts index d7fb023a226..8d0877a0d4d 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -20,6 +20,7 @@ export const ROLES = ["system", "user", "assistant"] as const; export type MessageRole = (typeof ROLES)[number]; export const Models = ["gpt-3.5-turbo", "gpt-4"] as const; +export const TTSModels = ["tts-1", "tts-1-hd"] as const; export type ChatModel = ModelType; export interface MultimodalContent { @@ -48,6 +49,25 @@ export interface LLMConfig { style?: DalleRequestPayload["style"]; } +export interface SpeechOptions { + model: string; + input: string; + voice: string; + response_format?: string; + speed?: number; + onController?: (controller: AbortController) => void; +} + +export interface TranscriptionOptions { + model?: "whisper-1"; + file: Blob; + language?: string; + prompt?: string; + response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt"; + temperature?: number; + onController?: (controller: AbortController) => void; +} + export interface ChatOptions { messages: RequestMessage[]; config: LLMConfig; @@ -80,6 +100,8 @@ export interface LLMModelProvider { export abstract class LLMApi { abstract chat(options: ChatOptions): Promise; + abstract speech(options: SpeechOptions): Promise; + abstract transcription(options: TranscriptionOptions): Promise; abstract usage(): Promise; abstract models(): Promise; } diff --git a/app/client/platforms/openai.ts b/app/client/platforms/openai.ts index d4e262c16b4..02115140b72 100644 --- a/app/client/platforms/openai.ts +++ b/app/client/platforms/openai.ts @@ -26,6 +26,8 @@ import { LLMModel, LLMUsage, MultimodalContent, + SpeechOptions, + TranscriptionOptions, } from "../api"; import Locale from "../../locales"; import { @@ -77,7 +79,7 @@ export interface DalleRequestPayload { export class ChatGPTApi implements LLMApi { private disableListModels = true; - path(path: string): string { + path(path: string, model?: string): string { const accessStore = useAccessStore.getState(); let baseUrl = ""; @@ -140,6 +142,85 @@ export class ChatGPTApi implements LLMApi { return 
res.choices?.at(0)?.message?.content ?? res; } + async speech(options: SpeechOptions): Promise { + const requestPayload = { + model: options.model, + input: options.input, + voice: options.voice, + response_format: options.response_format, + speed: options.speed, + }; + + console.log("[Request] openai speech payload: ", requestPayload); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const speechPath = this.path(OpenaiPath.SpeechPath, options.model); + const speechPayload = { + method: "POST", + body: JSON.stringify(requestPayload), + signal: controller.signal, + headers: getHeaders(), + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + + const res = await fetch(speechPath, speechPayload); + clearTimeout(requestTimeoutId); + return await res.arrayBuffer(); + } catch (e) { + console.log("[Request] failed to make a speech request", e); + throw e; + } + } + + async transcription(options: TranscriptionOptions): Promise { + const formData = new FormData(); + formData.append("file", options.file, "audio.wav"); + formData.append("model", options.model ?? "whisper-1"); + if (options.language) formData.append("language", options.language); + if (options.prompt) formData.append("prompt", options.prompt); + if (options.response_format) + formData.append("response_format", options.response_format); + if (options.temperature) + formData.append("temperature", options.temperature.toString()); + + console.log("[Request] openai audio transcriptions payload: ", options); + + const controller = new AbortController(); + options.onController?.(controller); + + try { + const path = this.path(OpenaiPath.TranscriptionPath, options.model); + const headers = getHeaders(true); + const payload = { + method: "POST", + body: formData, + signal: controller.signal, + headers: headers, + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + REQUEST_TIMEOUT_MS, + ); + const res = await fetch(path, payload); + clearTimeout(requestTimeoutId); + const json = await res.json(); + return json.text; + } catch (e) { + console.log("[Request] failed to make a audio transcriptions request", e); + throw e; + } + } + async chat(options: ChatOptions) { const modelConfig = { ...useAppConfig.getState().modelConfig, diff --git a/app/components/chat.tsx b/app/components/chat.tsx index ed5b06799c3..e5391ad226c 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -10,11 +10,14 @@ import React, { } from "react"; import SendWhiteIcon from "../icons/send-white.svg"; +import VoiceWhiteIcon from "../icons/voice-white.svg"; import BrainIcon from "../icons/brain.svg"; import RenameIcon from "../icons/rename.svg"; import ExportIcon from "../icons/share.svg"; import ReturnIcon from "../icons/return.svg"; import CopyIcon from "../icons/copy.svg"; +import SpeakIcon from "../icons/speak.svg"; +import SpeakStopIcon from "../icons/speak-stop.svg"; import LoadingIcon from "../icons/three-dots.svg"; import LoadingButtonIcon from "../icons/loading.svg"; import PromptIcon from "../icons/prompt.svg"; @@ -64,6 +67,7 @@ import { getMessageImages, isVisionModel, isDalle3, + isFirefox, } from "../utils"; import { uploadImage as uploadImageRemote } from "@/app/utils/chat"; @@ -73,7 +77,7 @@ import dynamic from "next/dynamic"; import { ChatControllerPool } from "../client/controller"; import { DalleSize, DalleQuality, DalleStyle } from "../typing"; import { Prompt, usePromptStore } 
from "../store/prompt"; -import Locale from "../locales"; +import Locale, { getLang, getSTTLang } from "../locales"; import { IconButton } from "./button"; import styles from "./chat.module.scss"; @@ -90,6 +94,10 @@ import { import { useNavigate } from "react-router-dom"; import { CHAT_PAGE_SIZE, + DEFAULT_STT_ENGINE, + DEFAULT_TTS_ENGINE, + FIREFOX_DEFAULT_STT_ENGINE, + ModelProvider, LAST_INPUT_KEY, Path, REQUEST_TIMEOUT_MS, @@ -106,6 +114,16 @@ import { ExportMessageModal } from "./exporter"; import { getClientConfig } from "../config/client"; import { useAllModels } from "../utils/hooks"; import { MultimodalContent } from "../client/api"; +import { ClientApi } from "../client/api"; +import { createTTSPlayer } from "../utils/audio"; +import { + OpenAITranscriptionApi, + SpeechApi, + WebTranscriptionApi, +} from "../utils/speech"; +import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts"; + +const ttsPlayer = createTTSPlayer(); const Markdown = dynamic(async () => (await import("./markdown")).Markdown, { loading: () => , @@ -922,6 +940,33 @@ function _Chat() { } }; + const [isListening, setIsListening] = useState(false); + const [isTranscription, setIsTranscription] = useState(false); + const [speechApi, setSpeechApi] = useState(null); + + const startListening = async () => { + if (speechApi) { + await speechApi.start(); + setIsListening(true); + } + }; + + const stopListening = async () => { + if (speechApi) { + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(true); + await speechApi.stop(); + setIsListening(false); + } + }; + + const onRecognitionEnd = (finalTranscript: string) => { + console.log(finalTranscript); + if (finalTranscript) setUserInput(finalTranscript); + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(false); + }; + const doSubmit = (userInput: string) => { if (userInput.trim() === "") return; const matchCommand = chatCommands.match(userInput); @@ -992,6 +1037,16 @@ function _Chat() { } }); // eslint-disable-next-line react-hooks/exhaustive-deps + if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; + setSpeechApi( + config.sttConfig.engine === DEFAULT_STT_ENGINE + ? 
new WebTranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ) + : new OpenAITranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); }, []); // check if should send message @@ -1102,10 +1157,55 @@ function _Chat() { }); }; + const accessStore = useAccessStore(); + const [speechStatus, setSpeechStatus] = useState(false); + const [speechLoading, setSpeechLoading] = useState(false); + async function openaiSpeech(text: string) { + if (speechStatus) { + ttsPlayer.stop(); + setSpeechStatus(false); + } else { + var api: ClientApi; + api = new ClientApi(ModelProvider.GPT); + const config = useAppConfig.getState(); + setSpeechLoading(true); + ttsPlayer.init(); + let audioBuffer: ArrayBuffer; + const { markdownToTxt } = require("markdown-to-txt"); + const textContent = markdownToTxt(text); + if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) { + const edgeVoiceName = accessStore.edgeVoiceName(); + const tts = new MsEdgeTTS(); + await tts.setMetadata( + edgeVoiceName, + OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3, + ); + audioBuffer = await tts.toArrayBuffer(textContent); + } else { + audioBuffer = await api.llm.speech({ + model: config.ttsConfig.model, + input: textContent, + voice: config.ttsConfig.voice, + speed: config.ttsConfig.speed, + }); + } + setSpeechStatus(true); + ttsPlayer + .play(audioBuffer, () => { + setSpeechStatus(false); + }) + .catch((e) => { + console.error("[OpenAI Speech]", e); + showToast(prettyObject(e)); + setSpeechStatus(false); + }) + .finally(() => setSpeechLoading(false)); + } + } + const context: RenderMessage[] = useMemo(() => { return session.mask.hideContext ? [] : session.mask.context.slice(); }, [session.mask.context, session.mask.hideContext]); - const accessStore = useAccessStore(); if ( context.length === 0 && @@ -1567,6 +1667,26 @@ function _Chat() { ) } /> + {config.ttsConfig.enable && ( + + ) : ( + + ) + } + onClick={() => + openaiSpeech(getMessageTextContent(message)) + } + /> + )} )} @@ -1714,13 +1834,35 @@ function _Chat() { })} )} - } + text={ + isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak + } + className={styles["chat-input-send"]} + type="primary" + onClick={async () => + isListening ? 
await stopListening() : await startListening() + } + loding={isTranscription} + /> + ) : ( + } + text={Locale.Chat.Send} + className={styles["chat-input-send"]} + type="primary" + onClick={() => doSubmit(userInput)} + /> + )} + {/* } text={Locale.Chat.Send} className={styles["chat-input-send"]} type="primary" onClick={() => doSubmit(userInput)} - /> + /> */} diff --git a/app/components/settings.tsx b/app/components/settings.tsx index ca0a5a18796..47a72d79de7 100644 --- a/app/components/settings.tsx +++ b/app/components/settings.tsx @@ -80,6 +80,8 @@ import { useSyncStore } from "../store/sync"; import { nanoid } from "nanoid"; import { useMaskStore } from "../store/mask"; import { ProviderType } from "../utils/cloud"; +import { TTSConfigList } from "./tts-config"; +import { STTConfigList } from "./stt-config"; function EditPromptModal(props: { id: string; onClose: () => void }) { const promptStore = usePromptStore(); @@ -1646,6 +1648,28 @@ export function Settings() { setShowPromptModal(false)} /> )} + + { + const ttsConfig = { ...config.ttsConfig }; + updater(ttsConfig); + config.update((config) => (config.ttsConfig = ttsConfig)); + }} + /> + + + + { + const sttConfig = { ...config.sttConfig }; + updater(sttConfig); + config.update((config) => (config.sttConfig = sttConfig)); + }} + /> + + diff --git a/app/components/stt-config.tsx b/app/components/stt-config.tsx new file mode 100644 index 00000000000..f83d280305f --- /dev/null +++ b/app/components/stt-config.tsx @@ -0,0 +1,51 @@ +import { STTConfig, STTConfigValidator } from "../store"; + +import Locale from "../locales"; +import { ListItem, Select } from "./ui-lib"; +import { DEFAULT_STT_ENGINES } from "../constant"; +import { isFirefox } from "../utils"; + +export function STTConfigList(props: { + sttConfig: STTConfig; + updateConfig: (updater: (config: STTConfig) => void) => void; +}) { + return ( + <> + + + props.updateConfig( + (config) => (config.enable = e.currentTarget.checked), + ) + } + > + + {!isFirefox() && ( + + + + )} + + ); +} diff --git a/app/components/stt.module.scss b/app/components/stt.module.scss new file mode 100644 index 00000000000..ba9f382e40b --- /dev/null +++ b/app/components/stt.module.scss @@ -0,0 +1,119 @@ +@import "../styles/animation.scss"; +.plugin-page { + height: 100%; + display: flex; + flex-direction: column; + + .plugin-page-body { + padding: 20px; + overflow-y: auto; + + .plugin-filter { + width: 100%; + max-width: 100%; + margin-bottom: 20px; + animation: slide-in ease 0.3s; + height: 40px; + + display: flex; + + .search-bar { + flex-grow: 1; + max-width: 100%; + min-width: 0; + outline: none; + } + + .search-bar:focus { + border: 1px solid var(--primary); + } + + .plugin-filter-lang { + height: 100%; + margin-left: 10px; + } + + .plugin-create { + height: 100%; + margin-left: 10px; + box-sizing: border-box; + min-width: 80px; + } + } + + .plugin-item { + display: flex; + justify-content: space-between; + padding: 20px; + border: var(--border-in-light); + animation: slide-in ease 0.3s; + + &:not(:last-child) { + border-bottom: 0; + } + + &:first-child { + border-top-left-radius: 10px; + border-top-right-radius: 10px; + } + + &:last-child { + border-bottom-left-radius: 10px; + border-bottom-right-radius: 10px; + } + + .plugin-header { + display: flex; + align-items: center; + + .plugin-icon { + display: flex; + align-items: center; + justify-content: center; + margin-right: 10px; + } + + .plugin-title { + .plugin-name { + font-size: 14px; + font-weight: bold; + } + .plugin-info { + font-size: 12px; 
+ } + .plugin-runtime-warning { + font-size: 12px; + color: #f86c6c; + } + } + } + + .plugin-actions { + display: flex; + flex-wrap: nowrap; + transition: all ease 0.3s; + justify-content: center; + align-items: center; + } + + @media screen and (max-width: 600px) { + display: flex; + flex-direction: column; + padding-bottom: 10px; + border-radius: 10px; + margin-bottom: 20px; + box-shadow: var(--card-shadow); + + &:not(:last-child) { + border-bottom: var(--border-in-light); + } + + .plugin-actions { + width: 100%; + justify-content: space-between; + padding-top: 10px; + } + } + } + } +} diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx new file mode 100644 index 00000000000..f86e3bc520a --- /dev/null +++ b/app/components/tts-config.tsx @@ -0,0 +1,132 @@ +import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store"; + +import Locale from "../locales"; +import { ListItem, Select } from "./ui-lib"; +import { + DEFAULT_TTS_ENGINE, + DEFAULT_TTS_ENGINES, + DEFAULT_TTS_MODELS, + DEFAULT_TTS_VOICES, +} from "../constant"; +import { InputRange } from "./input-range"; + +export function TTSConfigList(props: { + ttsConfig: TTSConfig; + updateConfig: (updater: (config: TTSConfig) => void) => void; +}) { + return ( + <> + + + props.updateConfig( + (config) => (config.enable = e.currentTarget.checked), + ) + } + > + + {/* + + props.updateConfig( + (config) => (config.autoplay = e.currentTarget.checked), + ) + } + > + */} + + + + {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && ( + <> + + + + + + + + { + props.updateConfig( + (config) => + (config.speed = TTSConfigValidator.speed( + e.currentTarget.valueAsNumber, + )), + ); + }} + > + + + )} + + ); +} diff --git a/app/components/tts.module.scss b/app/components/tts.module.scss new file mode 100644 index 00000000000..ba9f382e40b --- /dev/null +++ b/app/components/tts.module.scss @@ -0,0 +1,119 @@ +@import "../styles/animation.scss"; +.plugin-page { + height: 100%; + display: flex; + flex-direction: column; + + .plugin-page-body { + padding: 20px; + overflow-y: auto; + + .plugin-filter { + width: 100%; + max-width: 100%; + margin-bottom: 20px; + animation: slide-in ease 0.3s; + height: 40px; + + display: flex; + + .search-bar { + flex-grow: 1; + max-width: 100%; + min-width: 0; + outline: none; + } + + .search-bar:focus { + border: 1px solid var(--primary); + } + + .plugin-filter-lang { + height: 100%; + margin-left: 10px; + } + + .plugin-create { + height: 100%; + margin-left: 10px; + box-sizing: border-box; + min-width: 80px; + } + } + + .plugin-item { + display: flex; + justify-content: space-between; + padding: 20px; + border: var(--border-in-light); + animation: slide-in ease 0.3s; + + &:not(:last-child) { + border-bottom: 0; + } + + &:first-child { + border-top-left-radius: 10px; + border-top-right-radius: 10px; + } + + &:last-child { + border-bottom-left-radius: 10px; + border-bottom-right-radius: 10px; + } + + .plugin-header { + display: flex; + align-items: center; + + .plugin-icon { + display: flex; + align-items: center; + justify-content: center; + margin-right: 10px; + } + + .plugin-title { + .plugin-name { + font-size: 14px; + font-weight: bold; + } + .plugin-info { + font-size: 12px; + } + .plugin-runtime-warning { + font-size: 12px; + color: #f86c6c; + } + } + } + + .plugin-actions { + display: flex; + flex-wrap: nowrap; + transition: all ease 0.3s; + justify-content: center; + align-items: center; + } + + @media screen and (max-width: 600px) { + display: flex; + flex-direction: column; + 
padding-bottom: 10px; + border-radius: 10px; + margin-bottom: 20px; + box-shadow: var(--card-shadow); + + &:not(:last-child) { + border-bottom: var(--border-in-light); + } + + .plugin-actions { + width: 100%; + justify-content: space-between; + padding-top: 10px; + } + } + } + } +} diff --git a/app/constant.ts b/app/constant.ts index e88d497ca94..ec0445d2e0e 100644 --- a/app/constant.ts +++ b/app/constant.ts @@ -153,6 +153,8 @@ export const Anthropic = { export const OpenaiPath = { ChatPath: "v1/chat/completions", + SpeechPath: "v1/audio/speech", + TranscriptionPath: "v1/audio/transcriptions", ImagePath: "v1/images/generations", UsagePath: "dashboard/billing/usage", SubsPath: "dashboard/billing/subscription", @@ -256,6 +258,24 @@ export const KnowledgeCutOffDate: Record = { "gemini-pro-vision": "2023-12", }; +export const DEFAULT_TTS_ENGINE = "OpenAI-TTS"; +export const DEFAULT_TTS_ENGINES = ["OpenAI-TTS", "Edge-TTS"]; +export const DEFAULT_TTS_MODEL = "tts-1"; +export const DEFAULT_TTS_VOICE = "alloy"; +export const DEFAULT_TTS_MODELS = ["tts-1", "tts-1-hd"]; +export const DEFAULT_TTS_VOICES = [ + "alloy", + "echo", + "fable", + "onyx", + "nova", + "shimmer", +]; + +export const DEFAULT_STT_ENGINE = "WebAPI"; +export const DEFAULT_STT_ENGINES = ["WebAPI", "OpenAI Whisper"]; +export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper"; + const openaiModels = [ "gpt-3.5-turbo", "gpt-3.5-turbo-1106", diff --git a/app/icons/speak-stop.svg b/app/icons/speak-stop.svg new file mode 100644 index 00000000000..926ae7bb3d6 --- /dev/null +++ b/app/icons/speak-stop.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/icons/speak.svg b/app/icons/speak.svg new file mode 100644 index 00000000000..e02212c9a42 --- /dev/null +++ b/app/icons/speak.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/app/icons/voice-white.svg b/app/icons/voice-white.svg new file mode 100644 index 00000000000..0a4a0ae31cd --- /dev/null +++ b/app/icons/voice-white.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/app/locales/cn.ts b/app/locales/cn.ts index 9a3227d68a5..c6aef51402f 100644 --- a/app/locales/cn.ts +++ b/app/locales/cn.ts @@ -43,6 +43,8 @@ const cn = { Delete: "删除", Edit: "编辑", FullScreen: "全屏", + Speech: "朗读", + StopSpeech: "停止", }, Commands: { new: "新建聊天", @@ -76,6 +78,8 @@ const cn = { return inputHints + ",/ 触发补全,: 触发命令"; }, Send: "发送", + StartSpeak: "说话", + StopSpeak: "停止", Config: { Reset: "清除记忆", SaveAs: "存为面具", @@ -481,6 +485,36 @@ const cn = { Title: "频率惩罚度 (frequency_penalty)", SubTitle: "值越大,越有可能降低重复字词", }, + TTS: { + Enable: { + Title: "启用文本转语音", + SubTitle: "启用文本生成语音服务", + }, + Autoplay: { + Title: "启用自动朗读", + SubTitle: "自动生成语音并播放,需先开启文本转语音开关", + }, + Model: "模型", + Engine: "转换引擎", + Voice: { + Title: "声音", + SubTitle: "生成语音时使用的声音", + }, + Speed: { + Title: "速度", + SubTitle: "生成语音的速度", + }, + }, + STT: { + Enable: { + Title: "启用语音转文本", + SubTitle: "启用语音转文本", + }, + Engine: { + Title: "转换引擎", + SubTitle: "音频转换引擎", + }, + }, }, Store: { DefaultTopic: "新的聊天", diff --git a/app/locales/en.ts b/app/locales/en.ts index 77f3a700ae1..1aa2137ec8d 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -45,6 +45,8 @@ const en: LocaleType = { Delete: "Delete", Edit: "Edit", FullScreen: "FullScreen", + Speech: "Play", + StopSpeech: "Stop", }, Commands: { new: "Start a new chat", diff --git a/app/locales/index.ts b/app/locales/index.ts index acdb3e878a1..3078afc7b54 100644 --- a/app/locales/index.ts +++ b/app/locales/index.ts @@ -137,3 +137,34 @@ export function getISOLang() { 
const lang = getLang(); return isoLangString[lang] ?? lang; } + +const DEFAULT_STT_LANG = "zh-CN"; +export const STT_LANG_MAP: Record = { + cn: "zh-CN", + en: "en-US", + pt: "pt-BR", + tw: "zh-TW", + jp: "ja-JP", + ko: "ko-KR", + id: "id-ID", + fr: "fr-FR", + es: "es-ES", + it: "it-IT", + tr: "tr-TR", + de: "de-DE", + vi: "vi-VN", + ru: "ru-RU", + cs: "cs-CZ", + no: "no-NO", + ar: "ar-SA", + bn: "bn-BD", + sk: "sk-SK", +}; + +export function getSTTLang(): string { + try { + return STT_LANG_MAP[getLang()]; + } catch { + return DEFAULT_STT_LANG; + } +} diff --git a/app/store/access.ts b/app/store/access.ts index a1014610e39..0e392e1e92e 100644 --- a/app/store/access.ts +++ b/app/store/access.ts @@ -120,6 +120,9 @@ const DEFAULT_ACCESS_STATE = { disableFastLink: false, customModels: "", defaultModel: "", + + // tts config + edgeTTSVoiceName: "zh-CN-YunxiNeural", }; export const useAccessStore = createPersistStore( @@ -132,6 +135,12 @@ export const useAccessStore = createPersistStore( return get().needCode; }, + edgeVoiceName() { + this.fetch(); + + return get().edgeTTSVoiceName; + }, + isValidOpenAI() { return ensure(get(), ["openaiApiKey"]); }, diff --git a/app/store/config.ts b/app/store/config.ts index e8e3c9863ef..e2de06c9ad8 100644 --- a/app/store/config.ts +++ b/app/store/config.ts @@ -5,12 +5,25 @@ import { DEFAULT_INPUT_TEMPLATE, DEFAULT_MODELS, DEFAULT_SIDEBAR_WIDTH, + DEFAULT_STT_ENGINE, + DEFAULT_STT_ENGINES, + DEFAULT_TTS_ENGINE, + DEFAULT_TTS_ENGINES, + DEFAULT_TTS_MODEL, + DEFAULT_TTS_MODELS, + DEFAULT_TTS_VOICE, + DEFAULT_TTS_VOICES, StoreKey, ServiceProvider, } from "../constant"; import { createPersistStore } from "../utils/store"; export type ModelType = (typeof DEFAULT_MODELS)[number]["name"]; +export type TTSModelType = (typeof DEFAULT_TTS_MODELS)[number]; +export type TTSVoiceType = (typeof DEFAULT_TTS_VOICES)[number]; +export type TTSEngineType = (typeof DEFAULT_TTS_ENGINES)[number]; + +export type STTEngineType = (typeof DEFAULT_STT_ENGINES)[number]; export enum SubmitKey { Enter = "Enter", @@ -66,11 +79,26 @@ export const DEFAULT_CONFIG = { quality: "standard" as DalleQuality, style: "vivid" as DalleStyle, }, + + ttsConfig: { + enable: false, + autoplay: false, + engine: DEFAULT_TTS_ENGINE, + model: DEFAULT_TTS_MODEL, + voice: DEFAULT_TTS_VOICE, + speed: 1.0, + }, + sttConfig: { + enable: false, + engine: DEFAULT_STT_ENGINE, + }, }; export type ChatConfig = typeof DEFAULT_CONFIG; export type ModelConfig = ChatConfig["modelConfig"]; +export type TTSConfig = ChatConfig["ttsConfig"]; +export type STTConfig = ChatConfig["sttConfig"]; export function limitNumber( x: number, @@ -85,6 +113,27 @@ export function limitNumber( return Math.min(max, Math.max(min, x)); } +export const TTSConfigValidator = { + engine(x: string) { + return x as TTSEngineType; + }, + model(x: string) { + return x as TTSModelType; + }, + voice(x: string) { + return x as TTSVoiceType; + }, + speed(x: number) { + return limitNumber(x, 0.25, 4.0, 1.0); + }, +}; + +export const STTConfigValidator = { + engine(x: string) { + return x as STTEngineType; + }, +}; + export const ModalConfigValidator = { model(x: string) { return x as ModelType; diff --git a/app/utils/audio.ts b/app/utils/audio.ts new file mode 100644 index 00000000000..f6828c7aac4 --- /dev/null +++ b/app/utils/audio.ts @@ -0,0 +1,45 @@ +type TTSPlayer = { + init: () => void; + play: (audioBuffer: ArrayBuffer, onended: () => void | null) => Promise; + stop: () => void; +}; + +export function createTTSPlayer(): TTSPlayer { + let audioContext: 
AudioContext | null = null; + let audioBufferSourceNode: AudioBufferSourceNode | null = null; + + const init = () => { + audioContext = new (window.AudioContext || window.webkitAudioContext)(); + audioContext.suspend(); + }; + + const play = async (audioBuffer: ArrayBuffer, onended: () => void | null) => { + if (audioBufferSourceNode) { + audioBufferSourceNode.stop(); + audioBufferSourceNode.disconnect(); + } + + const buffer = await audioContext!.decodeAudioData(audioBuffer); + audioBufferSourceNode = audioContext!.createBufferSource(); + audioBufferSourceNode.buffer = buffer; + audioBufferSourceNode.connect(audioContext!.destination); + audioContext!.resume().then(() => { + audioBufferSourceNode!.start(); + }); + audioBufferSourceNode.onended = onended; + }; + + const stop = () => { + if (audioBufferSourceNode) { + audioBufferSourceNode.stop(); + audioBufferSourceNode.disconnect(); + audioBufferSourceNode = null; + } + if (audioContext) { + audioContext.close(); + audioContext = null; + } + }; + + return { init, play, stop }; +} diff --git a/app/utils/ms_edge_tts.ts b/app/utils/ms_edge_tts.ts new file mode 100644 index 00000000000..f291ebada93 --- /dev/null +++ b/app/utils/ms_edge_tts.ts @@ -0,0 +1,391 @@ +// import axios from "axios"; +import { Buffer } from "buffer"; +import { randomBytes } from "crypto"; +import { Readable } from "stream"; + +// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume + */ +export enum VOLUME { + SILENT = "silent", + X_SOFT = "x-soft", + SOFT = "soft", + MEDIUM = "medium", + LOUD = "loud", + X_LOUD = "x-LOUD", + DEFAULT = "default", +} + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking + */ +export enum RATE { + X_SLOW = "x-slow", + SLOW = "slow", + MEDIUM = "medium", + FAST = "fast", + X_FAST = "x-fast", + DEFAULT = "default", +} + +/** + * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline + */ +export enum PITCH { + X_LOW = "x-low", + LOW = "low", + MEDIUM = "medium", + HIGH = "high", + X_HIGH = "x-high", + DEFAULT = "default", +} + +/** + * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted. 
+ */ +export enum OUTPUT_FORMAT { + // Streaming ============================= + // AMR_WB_16000HZ = "amr-wb-16000hz", + // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus", + // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3", + // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3", + // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3", + // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus", + // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus", + AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3", + AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3", + // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3", + // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3", + // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3", + // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus", + // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus", + // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus", + // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw", + // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw", + // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm", + // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm", + // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk", + // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm", + // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm", + // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk", + // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm", + // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm", + // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus", + // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus", + WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus", + // Non-streaming ============================= + // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw", + // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw", + // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm", + // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm", + // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm", + // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm", + // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm", +} + +export type Voice = { + Name: string; + ShortName: string; + Gender: string; + Locale: string; + SuggestedCodec: string; + FriendlyName: string; + Status: string; +}; + +export class ProsodyOptions { + /** + * The pitch to use. + * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%). + * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline) + */ + pitch?: PITCH | string = "+0Hz"; + /** + * The rate to use. + * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%). + * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking) + */ + rate?: RATE | string | number = 1.0; + /** + * The volume to use. + * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%). 
+ * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume) + */ + volume?: VOLUME | string | number = 100.0; +} + +export class MsEdgeTTS { + static OUTPUT_FORMAT = OUTPUT_FORMAT; + private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"; + private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; + private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`; + private static BINARY_DELIM = "Path:audio\r\n"; + private static VOICE_LANG_REGEX = /\w{2}-\w{2}/; + private readonly _enableLogger; + private _ws: WebSocket | undefined; + private _voice: any; + private _voiceLocale: any; + private _outputFormat: any; + private _streams: { [key: string]: Readable } = {}; + private _startTime = 0; + + private _log(...o: any[]) { + if (this._enableLogger) { + console.log(...o); + } + } + + /** + * Create a new `MsEdgeTTS` instance. + * + * @param agent (optional, **NOT SUPPORTED IN BROWSER**) Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent). + * @param enableLogger=false whether to enable the built-in logger. This logs connections inits, disconnects, and incoming data to the console + */ + public constructor(enableLogger: boolean = false) { + this._enableLogger = enableLogger; + } + + private async _send(message: any) { + for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) { + if (i == 1) { + this._startTime = Date.now(); + } + this._log("connecting: ", i); + await this._initClient(); + } + this._ws!.send(message); + } + + private _initClient() { + this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL); + + this._ws.binaryType = "arraybuffer"; + return new Promise((resolve, reject) => { + this._ws!.onopen = () => { + this._log( + "Connected in", + (Date.now() - this._startTime) / 1000, + "seconds", + ); + this._send( + `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n + { + "context": { + "synthesis": { + "audio": { + "metadataoptions": { + "sentenceBoundaryEnabled": "false", + "wordBoundaryEnabled": "false" + }, + "outputFormat": "${this._outputFormat}" + } + } + } + } + `, + ).then(resolve); + }; + this._ws!.onmessage = (m: any) => { + const buffer = Buffer.from(m.data as ArrayBuffer); + const message = buffer.toString(); + const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1]; + if (message.includes("Path:turn.start")) { + // start of turn, ignore + } else if (message.includes("Path:turn.end")) { + // end of turn, close stream + this._streams[requestId].push(null); + } else if (message.includes("Path:response")) { + // context response, ignore + } else if ( + message.includes("Path:audio") && + m.data instanceof ArrayBuffer + ) { + this._pushAudioData(buffer, requestId); + } else { + this._log("UNKNOWN MESSAGE", message); + } + }; + this._ws!.onclose = () => { + this._log( + "disconnected after:", + (Date.now() - this._startTime) / 1000, + "seconds", + ); + for (const requestId in this._streams) { + this._streams[requestId].push(null); + } + }; + this._ws!.onerror = function (error: any) { + reject("Connect Error: " + error); + }; + }); + } + + private 
_pushAudioData(audioBuffer: Buffer, requestId: string) { + const audioStartIndex = + audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) + + MsEdgeTTS.BINARY_DELIM.length; + const audioData = audioBuffer.subarray(audioStartIndex); + this._streams[requestId].push(audioData); + this._log("received audio chunk, size: ", audioData?.length); + } + + private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string { + // in case future updates to the edge API block these elements, we'll be concatenating strings. + options = { ...new ProsodyOptions(), ...options }; + return ` + + + ${input} + + + `; + } + + /** + * Fetch the list of voices available in Microsoft Edge. + * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview). + */ + // getVoices(): Promise { + // return new Promise((resolve, reject) => { + // axios + // .get(MsEdgeTTS.VOICES_URL) + // .then((res) => resolve(res.data)) + // .catch(reject); + // }); + // } + getVoices(): Promise { + return fetch(MsEdgeTTS.VOICES_URL) + .then((response) => { + if (!response.ok) { + throw new Error("Network response was not ok"); + } + return response.json(); + }) + .then((data) => data as Voice[]) + .catch((error) => { + throw error; + }); + } + + /** + * Sets the required information for the speech to be synthesised and inits a new WebSocket connection. + * Must be called at least once before text can be synthesised. + * Saved in this instance. Can be called at any time times to update the metadata. + * + * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices) + * @param outputFormat any {@link OUTPUT_FORMAT} + * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName` + */ + async setMetadata( + voiceName: string, + outputFormat: OUTPUT_FORMAT, + voiceLocale?: string, + ) { + const oldVoice = this._voice; + const oldVoiceLocale = this._voiceLocale; + const oldOutputFormat = this._outputFormat; + + this._voice = voiceName; + this._voiceLocale = voiceLocale; + if (!this._voiceLocale) { + const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice); + if (!voiceLangMatch) + throw new Error("Could not infer voiceLocale from voiceName!"); + this._voiceLocale = voiceLangMatch[0]; + } + this._outputFormat = outputFormat; + + const changed = + oldVoice !== this._voice || + oldVoiceLocale !== this._voiceLocale || + oldOutputFormat !== this._outputFormat; + + // create new client + if (changed || this._ws!.readyState !== this._ws!.OPEN) { + this._startTime = Date.now(); + await this._initClient(); + } + } + + private _metadataCheck() { + if (!this._ws) + throw new Error( + "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.", + ); + } + + /** + * Close the WebSocket connection. + */ + close() { + this._ws!.close(); + } + + /** + * Writes raw audio synthesised from text in real-time to a {@link Readable}. 
Uses a basic {@link _SSMLTemplate SML template}. + * + * @param input the text to synthesise. Can include SSML elements. + * @param options (optional) {@link ProsodyOptions} + * @returns {Readable} - a `stream.Readable` with the audio data + */ + toStream(input: string, options?: ProsodyOptions): Readable { + const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options)); + return stream; + } + + toArrayBuffer(input: string, options?: ProsodyOptions): Promise { + return new Promise((resolve, reject) => { + let data: Uint8Array[] = []; + const readable = this.toStream(input, options); + readable.on("data", (chunk) => { + data.push(chunk); + }); + + readable.on("end", () => { + resolve(Buffer.concat(data).buffer); + }); + + readable.on("error", (err) => { + reject(err); + }); + }); + } + + /** + * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request. + * + * @param requestSSML the SSML to send. SSML elements required in order to work. + * @returns {Readable} - a `stream.Readable` with the audio data + */ + rawToStream(requestSSML: string): Readable { + const { stream } = this._rawSSMLRequest(requestSSML); + return stream; + } + + private _rawSSMLRequest(requestSSML: string): { + stream: Readable; + requestId: string; + } { + this._metadataCheck(); + + const requestId = randomBytes(16).toString("hex"); + const request = + `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n + ` + requestSSML.trim(); + // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup + const self = this; + const stream = new Readable({ + read() {}, + destroy(error: Error | null, callback: (error: Error | null) => void) { + delete self._streams[requestId]; + callback(error); + }, + }); + this._streams[requestId] = stream; + this._send(request).then(); + return { stream, requestId }; + } +} diff --git a/app/utils/speech.ts b/app/utils/speech.ts new file mode 100644 index 00000000000..dc8102879fb --- /dev/null +++ b/app/utils/speech.ts @@ -0,0 +1,126 @@ +import { ChatGPTApi } from "../client/platforms/openai"; +import { getSTTLang } from "../locales"; +import { isFirefox } from "../utils"; + +export type TranscriptionCallback = (transcription: string) => void; + +export abstract class SpeechApi { + protected onTranscription: TranscriptionCallback = () => {}; + + abstract isListening(): boolean; + abstract start(): Promise; + abstract stop(): Promise; + + onTranscriptionReceived(callback: TranscriptionCallback) { + this.onTranscription = callback; + } +} + +export class OpenAITranscriptionApi extends SpeechApi { + private listeningStatus = false; + private mediaRecorder: MediaRecorder | null = null; + private stream: MediaStream | null = null; + private audioChunks: Blob[] = []; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback) { + super(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + } + + async start(): Promise { + // @ts-ignore + navigator.getUserMedia = + // @ts-ignore + navigator.getUserMedia || + // @ts-ignore + navigator.webkitGetUserMedia || + // @ts-ignore + navigator.mozGetUserMedia || + // @ts-ignore + navigator.msGetUserMedia; + if (navigator.mediaDevices) { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + this.mediaRecorder = new MediaRecorder(stream); + this.mediaRecorder.ondataavailable = (e) => { 
+ if (e.data && e.data.size > 0) { + this.audioChunks.push(e.data); + } + }; + + this.stream = stream; + } else { + console.warn("Media Decives will work only with SSL"); + return; + } + + this.audioChunks = []; + + // this.recorder.addEventListener("dataavailable", (event) => { + // this.audioChunks.push(event.data); + // }); + + this.mediaRecorder.start(1000); + this.listeningStatus = true; + } + + async stop(): Promise { + if (!this.mediaRecorder || !this.listeningStatus) { + return; + } + + return new Promise((resolve) => { + this.mediaRecorder!.addEventListener("stop", async () => { + const audioBlob = new Blob(this.audioChunks, { type: "audio/wav" }); + const llm = new ChatGPTApi(); + const transcription = await llm.transcription({ file: audioBlob }); + this.onTranscription(transcription); + this.listeningStatus = false; + resolve(); + }); + + this.mediaRecorder!.stop(); + }); + } +} + +export class WebTranscriptionApi extends SpeechApi { + private listeningStatus = false; + private recognitionInstance: any | null = null; + + isListening = () => this.listeningStatus; + + constructor(transcriptionCallback?: TranscriptionCallback) { + super(); + if (isFirefox()) return; + const SpeechRecognition = + (window as any).SpeechRecognition || + (window as any).webkitSpeechRecognition; + this.recognitionInstance = new SpeechRecognition(); + this.recognitionInstance.continuous = true; + this.recognitionInstance.interimResults = true; + this.recognitionInstance.lang = getSTTLang(); + if (transcriptionCallback) { + this.onTranscriptionReceived(transcriptionCallback); + } + this.recognitionInstance.onresult = (event: any) => { + const result = event.results[event.results.length - 1]; + if (result.isFinal) { + this.onTranscription(result[0].transcript); + } + }; + } + + async start(): Promise { + this.listeningStatus = true; + await this.recognitionInstance.start(); + } + + async stop(): Promise { + this.listeningStatus = false; + await this.recognitionInstance.stop(); + } +} diff --git a/package.json b/package.json index eb0a5ef6735..02d36ae3167 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "html-to-image": "^1.11.11", "lodash-es": "^4.17.21", "mermaid": "^10.6.1", + "markdown-to-txt": "^2.0.1", "nanoid": "^5.0.3", "next": "^14.1.1", "node-fetch": "^3.3.1", @@ -73,4 +74,4 @@ "lint-staged/yaml": "^2.2.2" }, "packageManager": "yarn@1.22.19" -} +} \ No newline at end of file diff --git a/yarn.lock b/yarn.lock index 793c845d722..3b76a49e780 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4378,11 +4378,21 @@ lodash.debounce@^4.0.8: resolved "https://registry.yarnpkg.com/lodash.debounce/-/lodash.debounce-4.0.8.tgz#82d79bff30a67c4005ffd5e2515300ad9ca4d7af" integrity sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow== +lodash.escape@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/lodash.escape/-/lodash.escape-4.0.1.tgz#c9044690c21e04294beaa517712fded1fa88de98" + integrity sha512-nXEOnb/jK9g0DYMr1/Xvq6l5xMD7GDG55+GSYIYmS0G4tBk/hURD4JR9WCavs04t33WmJx9kCyp9vJ+mr4BOUw== + lodash.merge@^4.6.2: version "4.6.2" resolved "https://registry.yarnpkg.com/lodash.merge/-/lodash.merge-4.6.2.tgz#558aa53b43b661e1925a0afdfa36a9a1085fe57a" integrity sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ== +lodash.unescape@^4.0.1: + version "4.0.1" + resolved "https://registry.yarnpkg.com/lodash.unescape/-/lodash.unescape-4.0.1.tgz#bf2249886ce514cda112fae9218cdc065211fc9c" + integrity 
sha512-DhhGRshNS1aX6s5YdBE3njCCouPgnG29ebyHvImlZzXZf2SHgt+J08DHgytTPnpywNbO1Y8mNUFyQuIDBq2JZg== + lodash@^4.17.21: version "4.17.21" resolved "https://registry.npmmirror.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c" @@ -4438,6 +4448,20 @@ markdown-table@^3.0.0: resolved "https://registry.yarnpkg.com/markdown-table/-/markdown-table-3.0.3.tgz#e6331d30e493127e031dd385488b5bd326e4a6bd" integrity sha512-Z1NL3Tb1M9wH4XESsCDEksWoKTdlUafKc4pt0GRwjUyXaCFZ+dc3g2erqB6zm3szA2IUSi7VnPI+o/9jnxh9hw== +markdown-to-txt@^2.0.1: + version "2.0.1" + resolved "https://registry.yarnpkg.com/markdown-to-txt/-/markdown-to-txt-2.0.1.tgz#bfd6233a2635443cc24900a158b60c6af36ce9c5" + integrity sha512-Hsj7KTN8k1gutlLum3vosHwVZGnv8/cbYKWVkUyo/D1rzOYddbDesILebRfOsaVfjIBJank/AVOySBlHAYqfZw== + dependencies: + lodash.escape "^4.0.1" + lodash.unescape "^4.0.1" + marked "^4.0.14" + +marked@^4.0.14: + version "4.3.0" + resolved "https://registry.yarnpkg.com/marked/-/marked-4.3.0.tgz#796362821b019f734054582038b116481b456cf3" + integrity sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A== + mdast-util-definitions@^5.0.0: version "5.1.2" resolved "https://registry.yarnpkg.com/mdast-util-definitions/-/mdast-util-definitions-5.1.2.tgz#9910abb60ac5d7115d6819b57ae0bcef07a3f7a7" From 93f1762e6c85e2a71a70534dc8a84b322d3643e7 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Tue, 27 Aug 2024 17:02:44 +0800 Subject: [PATCH 2/7] chore: wip --- app/client/platforms/alibaba.ts | 7 +++++++ app/client/platforms/anthropic.ts | 7 +++++++ app/client/platforms/baidu.ts | 7 +++++++ app/client/platforms/bytedance.ts | 7 +++++++ app/client/platforms/google.ts | 6 ++++++ app/client/platforms/iflytek.ts | 7 +++++++ app/client/platforms/moonshot.ts | 7 +++++++ app/client/platforms/tencent.ts | 7 +++++++ 8 files changed, 55 insertions(+) diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index d5fa3042fc1..477ef193fdc 100644 --- a/app/client/platforms/alibaba.ts +++ b/app/client/platforms/alibaba.ts @@ -83,6 +83,13 @@ export class QwenApi implements LLMApi { return res?.output?.choices?.at(0)?.message?.content ?? 
""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts index b079ba1ada2..df4dc7f3830 100644 --- a/app/client/platforms/anthropic.ts +++ b/app/client/platforms/anthropic.ts @@ -73,6 +73,13 @@ const ClaudeMapper = { const keys = ["claude-2, claude-instant-1"]; export class ClaudeApi implements LLMApi { + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + extractMessage(res: any) { console.log("[Response] claude response: ", res); diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts index 3be147f4985..2b3119c2a2c 100644 --- a/app/client/platforms/baidu.ts +++ b/app/client/platforms/baidu.ts @@ -75,6 +75,13 @@ export class ErnieApi implements LLMApi { return [baseUrl, path].join("/"); } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ // "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function", diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts index 7677cafe12b..31c0be3d33b 100644 --- a/app/client/platforms/bytedance.ts +++ b/app/client/platforms/bytedance.ts @@ -77,6 +77,13 @@ export class DoubaoApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages = options.messages.map((v) => ({ role: v.role, diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts index 12d8846357a..6c6c3b25e26 100644 --- a/app/client/platforms/google.ts +++ b/app/client/platforms/google.ts @@ -56,6 +56,12 @@ export class GeminiProApi implements LLMApi { "" ); } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } async chat(options: ChatOptions): Promise { const apiClient = this; let multimodal = false; diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts index 73cea5ba0e7..77a4571e124 100644 --- a/app/client/platforms/iflytek.ts +++ b/app/client/platforms/iflytek.ts @@ -53,6 +53,13 @@ export class SparkApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? 
""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts index 7d257ccb2e6..22bbaf01f46 100644 --- a/app/client/platforms/moonshot.ts +++ b/app/client/platforms/moonshot.ts @@ -66,6 +66,13 @@ export class MoonshotApi implements LLMApi { return res.choices?.at(0)?.message?.content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const messages: ChatOptions["messages"] = []; for (const v of options.messages) { diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts index 579008a9b9d..5eb48791b01 100644 --- a/app/client/platforms/tencent.ts +++ b/app/client/platforms/tencent.ts @@ -89,6 +89,13 @@ export class HunyuanApi implements LLMApi { return res.Choices?.at(0)?.Message?.Content ?? ""; } + speech(options: SpeechOptions): Promise { + throw new Error("Method not implemented."); + } + transcription(options: TranscriptionOptions): Promise { + throw new Error("Method not implemented."); + } + async chat(options: ChatOptions) { const visionModel = isVisionModel(options.config.model); const messages = options.messages.map((v, index) => ({ From f86b220c922a9209e99e2a3647e97ab72f47de3d Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Tue, 27 Aug 2024 19:50:16 +0800 Subject: [PATCH 3/7] feat: add voice action --- app/components/chat.tsx | 113 ++++++++++++++++++-------------------- app/icons/voice-white.svg | 6 +- 2 files changed, 55 insertions(+), 64 deletions(-) diff --git a/app/components/chat.tsx b/app/components/chat.tsx index e5391ad226c..624b7618e21 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -453,6 +453,7 @@ export function ChatActions(props: { showPromptHints: () => void; hitBottom: boolean; uploading: boolean; + setUserInput: (input: string) => void; }) { const config = useAppConfig(); const navigate = useNavigate(); @@ -544,6 +545,44 @@ export function ChatActions(props: { } }, [chatStore, currentModel, models]); + const [isListening, setIsListening] = useState(false); + const [isTranscription, setIsTranscription] = useState(false); + const [speechApi, setSpeechApi] = useState(null); + + useEffect(() => { + if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; + setSpeechApi( + config.sttConfig.engine === DEFAULT_STT_ENGINE + ? new WebTranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ) + : new OpenAITranscriptionApi((transcription) => + onRecognitionEnd(transcription), + ), + ); + }, []); + + const startListening = async () => { + if (speechApi) { + await speechApi.start(); + setIsListening(true); + } + }; + const stopListening = async () => { + if (speechApi) { + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(true); + await speechApi.stop(); + setIsListening(false); + } + }; + const onRecognitionEnd = (finalTranscript: string) => { + console.log(finalTranscript); + if (finalTranscript) props.setUserInput(finalTranscript); + if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) + setIsTranscription(false); + }; + return (
       {couldStop && (
@@ -768,6 +807,16 @@ export function ChatActions(props: {
           }}
         />
       )}
+
+      {config.sttConfig.enable && (
+        <ChatAction
+          onClick={async () =>
+            isListening ? await stopListening() : await startListening()
+          }
+          text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak}
+          icon={<VoiceWhiteIcon />}
+        />
+      )}
     </div>
); } @@ -940,33 +989,6 @@ function _Chat() { } }; - const [isListening, setIsListening] = useState(false); - const [isTranscription, setIsTranscription] = useState(false); - const [speechApi, setSpeechApi] = useState(null); - - const startListening = async () => { - if (speechApi) { - await speechApi.start(); - setIsListening(true); - } - }; - - const stopListening = async () => { - if (speechApi) { - if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) - setIsTranscription(true); - await speechApi.stop(); - setIsListening(false); - } - }; - - const onRecognitionEnd = (finalTranscript: string) => { - console.log(finalTranscript); - if (finalTranscript) setUserInput(finalTranscript); - if (config.sttConfig.engine !== DEFAULT_STT_ENGINE) - setIsTranscription(false); - }; - const doSubmit = (userInput: string) => { if (userInput.trim() === "") return; const matchCommand = chatCommands.match(userInput); @@ -1037,16 +1059,6 @@ function _Chat() { } }); // eslint-disable-next-line react-hooks/exhaustive-deps - if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE; - setSpeechApi( - config.sttConfig.engine === DEFAULT_STT_ENGINE - ? new WebTranscriptionApi((transcription) => - onRecognitionEnd(transcription), - ) - : new OpenAITranscriptionApi((transcription) => - onRecognitionEnd(transcription), - ), - ); }, []); // check if should send message @@ -1784,6 +1796,7 @@ function _Chat() { setUserInput("/"); onSearch(""); }} + setUserInput={setUserInput} /> diff --git a/app/icons/voice-white.svg b/app/icons/voice-white.svg index 0a4a0ae31cd..e7d5cbcc86f 100644 --- a/app/icons/voice-white.svg +++ b/app/icons/voice-white.svg @@ -1,4 +1,4 @@ - + @@ -7,9 +7,9 @@ - + - + From e9f90a4d82edbb446aedaef7ae27984d21b870d4 Mon Sep 17 00:00:00 2001 From: Meaqua Date: Tue, 27 Aug 2024 21:49:00 +0800 Subject: [PATCH 4/7] fix: i18n --- app/locales/en.ts | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/app/locales/en.ts b/app/locales/en.ts index 1aa2137ec8d..ae20a0d4f89 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -490,6 +490,37 @@ const en: LocaleType = { SubTitle: "A larger value decreasing the likelihood to repeat the same line", }, + TTS: { + Enable: { + Title: "Enable TTS", + SubTitle: "Enable text-to-speech service", + }, + Autoplay: { + Title: "Enable Autoplay", + SubTitle: + "Automatically generate speech and play, you need to enable the text-to-speech switch first", + }, + Model: "Model", + Voice: { + Title: "Voice", + SubTitle: "The voice to use when generating the audio", + }, + Speed: { + Title: "Speed", + SubTitle: "The speed of the generated audio", + }, + Engine: "TTS Engine", + }, + STT: { + Enable: { + Title: "Enable STT", + SubTitle: "Enable Speech-to-Text", + }, + Engine: { + Title: "STT Engine", + SubTitle: "Text-to-Speech Engine", + }, + }, }, Store: { DefaultTopic: "New Conversation", From ed5aea0521797841981919fa3c1ebb6340c35168 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Wed, 28 Aug 2024 12:37:19 +0800 Subject: [PATCH 5/7] fix: bug --- app/client/platforms/alibaba.ts | 2 ++ app/client/platforms/anthropic.ts | 9 ++++++++- app/client/platforms/baidu.ts | 2 ++ app/client/platforms/bytedance.ts | 2 ++ app/client/platforms/google.ts | 10 +++++++++- app/client/platforms/iflytek.ts | 9 ++++++++- app/client/platforms/moonshot.ts | 2 ++ app/client/platforms/tencent.ts | 2 ++ 8 files changed, 35 insertions(+), 3 deletions(-) diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts index 477ef193fdc..e839c69f01f 100644 
From ed5aea0521797841981919fa3c1ebb6340c35168 Mon Sep 17 00:00:00 2001
From: DDMeaqua
Date: Wed, 28 Aug 2024 12:37:19 +0800
Subject: [PATCH 5/7] fix: bug

---
 app/client/platforms/alibaba.ts   |  2 ++
 app/client/platforms/anthropic.ts |  9 ++++++++-
 app/client/platforms/baidu.ts     |  2 ++
 app/client/platforms/bytedance.ts |  2 ++
 app/client/platforms/google.ts    | 10 +++++++++-
 app/client/platforms/iflytek.ts   |  9 ++++++++-
 app/client/platforms/moonshot.ts  |  2 ++
 app/client/platforms/tencent.ts   |  2 ++
 8 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
index 477ef193fdc..e839c69f01f 100644
--- a/app/client/platforms/alibaba.ts
+++ b/app/client/platforms/alibaba.ts
@@ -12,6 +12,8 @@ import {
   getHeaders,
   LLMApi,
   LLMModel,
+  SpeechOptions,
+  TranscriptionOptions,
   MultimodalContent,
 } from "../api";
 import Locale from "../../locales";
diff --git a/app/client/platforms/anthropic.ts b/app/client/platforms/anthropic.ts
index df4dc7f3830..f0f95f0fd98 100644
--- a/app/client/platforms/anthropic.ts
+++ b/app/client/platforms/anthropic.ts
@@ -1,5 +1,12 @@
 import { ACCESS_CODE_PREFIX, Anthropic, ApiPath } from "@/app/constant";
-import { ChatOptions, getHeaders, LLMApi, MultimodalContent } from "../api";
+import {
+  ChatOptions,
+  getHeaders,
+  LLMApi,
+  MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
+} from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import { getClientConfig } from "@/app/config/client";
 import { DEFAULT_API_HOST } from "@/app/constant";
diff --git a/app/client/platforms/baidu.ts b/app/client/platforms/baidu.ts
index 2b3119c2a2c..0c2be5fb14b 100644
--- a/app/client/platforms/baidu.ts
+++ b/app/client/platforms/baidu.ts
@@ -14,6 +14,8 @@ import {
   LLMApi,
   LLMModel,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
diff --git a/app/client/platforms/bytedance.ts b/app/client/platforms/bytedance.ts
index 31c0be3d33b..5a0c9b8b12e 100644
--- a/app/client/platforms/bytedance.ts
+++ b/app/client/platforms/bytedance.ts
@@ -13,6 +13,8 @@ import {
   LLMApi,
   LLMModel,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
diff --git a/app/client/platforms/google.ts b/app/client/platforms/google.ts
index 6c6c3b25e26..c8d3658b350 100644
--- a/app/client/platforms/google.ts
+++ b/app/client/platforms/google.ts
@@ -1,5 +1,13 @@
 import { ApiPath, Google, REQUEST_TIMEOUT_MS } from "@/app/constant";
-import { ChatOptions, getHeaders, LLMApi, LLMModel, LLMUsage } from "../api";
+import {
+  ChatOptions,
+  getHeaders,
+  LLMApi,
+  LLMModel,
+  LLMUsage,
+  SpeechOptions,
+  TranscriptionOptions,
+} from "../api";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 import { getClientConfig } from "@/app/config/client";
 import { DEFAULT_API_HOST } from "@/app/constant";
diff --git a/app/client/platforms/iflytek.ts b/app/client/platforms/iflytek.ts
index 77a4571e124..6463e052e40 100644
--- a/app/client/platforms/iflytek.ts
+++ b/app/client/platforms/iflytek.ts
@@ -7,7 +7,14 @@ import {
 } from "@/app/constant";
 import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
 
-import { ChatOptions, getHeaders, LLMApi, LLMModel } from "../api";
+import {
+  ChatOptions,
+  getHeaders,
+  LLMApi,
+  LLMModel,
+  SpeechOptions,
+  TranscriptionOptions,
+} from "../api";
 import Locale from "../../locales";
 import {
   EventStreamContentType,
diff --git a/app/client/platforms/moonshot.ts b/app/client/platforms/moonshot.ts
index 22bbaf01f46..b5a8aa5880d 100644
--- a/app/client/platforms/moonshot.ts
+++ b/app/client/platforms/moonshot.ts
@@ -20,6 +20,8 @@ import {
   LLMModel,
   LLMUsage,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
diff --git a/app/client/platforms/tencent.ts b/app/client/platforms/tencent.ts
index 5eb48791b01..1739b7a142b 100644
--- a/app/client/platforms/tencent.ts
+++ b/app/client/platforms/tencent.ts
@@ -8,6 +8,8 @@ import {
   LLMApi,
   LLMModel,
   MultimodalContent,
+  SpeechOptions,
+  TranscriptionOptions,
 } from "../api";
 import Locale from "../../locales";
 import {
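The hunks above only add the type imports; to satisfy the abstract speech and transcription members that LLMApi now declares, each of these platform clients presumably also carries stub implementations along the following lines (a sketch — return types are inferred from the OpenAI client, and the actual method bodies are not shown in this patch):

  // Hypothetical stubs for providers without native TTS/STT support,
  // placed inside the platform's LLMApi implementation class.
  speech(options: SpeechOptions): Promise<ArrayBuffer> {
    throw new Error("Method not implemented.");
  }

  transcription(options: TranscriptionOptions): Promise<string> {
    throw new Error("Method not implemented.");
  }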
from "../api"; import Locale from "../../locales"; import { From 318e0989a2c28ae323d3f00d8256a7e48169e4a6 Mon Sep 17 00:00:00 2001 From: DDMeaqua Date: Wed, 28 Aug 2024 13:13:41 +0800 Subject: [PATCH 6/7] fix: transcription headers --- app/client/api.ts | 13 ++++++++----- app/components/chat.tsx | 1 - app/components/tts-config.tsx | 3 ++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/app/client/api.ts b/app/client/api.ts index 8d0877a0d4d..7e1d0135ed6 100644 --- a/app/client/api.ts +++ b/app/client/api.ts @@ -220,13 +220,16 @@ export function validString(x: string): boolean { return x?.length > 0; } -export function getHeaders() { +export function getHeaders(ignoreHeaders?: boolean) { const accessStore = useAccessStore.getState(); const chatStore = useChatStore.getState(); - const headers: Record = { - "Content-Type": "application/json", - Accept: "application/json", - }; + let headers: Record = {}; + if (!ignoreHeaders) { + headers = { + "Content-Type": "application/json", + Accept: "application/json", + }; + } const clientConfig = getClientConfig(); diff --git a/app/components/chat.tsx b/app/components/chat.tsx index 624b7618e21..f4ebd70d88d 100644 --- a/app/components/chat.tsx +++ b/app/components/chat.tsx @@ -1686,7 +1686,6 @@ function _Chat() { ? Locale.Chat.Actions.StopSpeech : Locale.Chat.Actions.Speech } - loding={speechLoading} icon={ speechStatus ? ( diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx index f86e3bc520a..39ae85730c2 100644 --- a/app/components/tts-config.tsx +++ b/app/components/tts-config.tsx @@ -1,4 +1,4 @@ -import { PluginConfig, TTSConfig, TTSConfigValidator } from "../store"; +import { TTSConfig, TTSConfigValidator } from "../store"; import Locale from "../locales"; import { ListItem, Select } from "./ui-lib"; @@ -111,6 +111,7 @@ export function TTSConfigList(props: { subTitle={Locale.Settings.TTS.Speed.SubTitle} > Date: Wed, 28 Aug 2024 13:15:52 +0800 Subject: [PATCH 7/7] fix: i18n --- app/locales/en.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/app/locales/en.ts b/app/locales/en.ts index ae20a0d4f89..dd13ff99cc5 100644 --- a/app/locales/en.ts +++ b/app/locales/en.ts @@ -80,6 +80,8 @@ const en: LocaleType = { return inputHints + ", / to search prompts, : to use commands"; }, Send: "Send", + StartSpeak: "Start Speak", + StopSpeak: "Stop Speak", Config: { Reset: "Reset to Default", SaveAs: "Save as Mask",