feat: add tts & stt #5329

Closed · wants to merge 9 commits
35 changes: 30 additions & 5 deletions app/client/api.ts
@@ -26,6 +26,7 @@ export const ROLES = ["system", "user", "assistant"] as const;
export type MessageRole = (typeof ROLES)[number];

export const Models = ["gpt-3.5-turbo", "gpt-4"] as const;
export const TTSModels = ["tts-1", "tts-1-hd"] as const;
export type ChatModel = ModelType;

export interface MultimodalContent {
@@ -54,6 +55,25 @@ export interface LLMConfig {
style?: DalleRequestPayload["style"];
}

export interface SpeechOptions {
model: string;
input: string;
voice: string;
response_format?: string;
speed?: number;
onController?: (controller: AbortController) => void;
}

export interface TranscriptionOptions {
model?: "whisper-1";
file: Blob;
language?: string;
prompt?: string;
response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
temperature?: number;
onController?: (controller: AbortController) => void;
}

export interface ChatOptions {
messages: RequestMessage[];
config: LLMConfig;
@@ -88,6 +108,8 @@ export interface LLMModelProvider {

export abstract class LLMApi {
abstract chat(options: ChatOptions): Promise<void>;
abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
abstract transcription(options: TranscriptionOptions): Promise<string>;
abstract usage(): Promise<LLMUsage>;
abstract models(): Promise<LLMModel[]>;
}
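
With these two abstract members, every LLMApi subclass must now implement speech and transcription. A minimal caller-side sketch of the new surface, assuming `llm` is any concrete LLMApi; the speakReply helper and its argument values are illustrative, not part of this diff:

```ts
// Sketch: synthesize audio for a piece of text through the new abstract API.
async function speakReply(llm: LLMApi, text: string): Promise<ArrayBuffer> {
  return llm.speech({
    model: "tts-1", // one of TTSModels above
    input: text,
    voice: "alloy", // assumed OpenAI voice name
    speed: 1.0,
  });
}
```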
@@ -206,13 +228,16 @@ export function validString(x: string): boolean {
return x?.length > 0;
}

export function getHeaders() {
export function getHeaders(ignoreHeaders?: boolean) {
const accessStore = useAccessStore.getState();
const chatStore = useChatStore.getState();
const headers: Record<string, string> = {
"Content-Type": "application/json",
Accept: "application/json",
};
let headers: Record<string, string> = {};
if (!ignoreHeaders) {
headers = {
"Content-Type": "application/json",
Accept: "application/json",
};
}

const clientConfig = getClientConfig();

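The new ignoreHeaders flag matters for the multipart upload added later in this PR: when the request body is FormData, the browser has to generate the Content-Type header itself so that the multipart boundary is included, which a hard-coded "application/json" would break. A sketch of the two modes, assuming the exported getHeaders above (illustrative only):

```ts
// JSON request: defaults apply ("Content-Type: application/json", Accept).
const jsonInit = {
  method: "POST",
  headers: getHeaders(),
  body: JSON.stringify({ hello: "world" }),
};

// FormData upload: pass `true` so fetch can set
// "multipart/form-data; boundary=..." on its own.
const form = new FormData();
const uploadInit = { method: "POST", headers: getHeaders(true), body: form };
```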
9 changes: 9 additions & 0 deletions app/client/platforms/alibaba.ts
@@ -12,6 +12,8 @@ import {
getHeaders,
LLMApi,
LLMModel,
SpeechOptions,
TranscriptionOptions,
MultimodalContent,
} from "../api";
import Locale from "../../locales";
@@ -83,6 +85,13 @@ export class QwenApi implements LLMApi {
return res?.output?.choices?.at(0)?.message?.content ?? "";
}

speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}

async chat(options: ChatOptions) {
const messages = options.messages.map((v) => ({
role: v.role,
16 changes: 15 additions & 1 deletion app/client/platforms/anthropic.ts
@@ -1,5 +1,12 @@
import { ACCESS_CODE_PREFIX, Anthropic, ApiPath } from "@/app/constant";
import { ChatOptions, getHeaders, LLMApi, MultimodalContent } from "../api";
import {
ChatOptions,
getHeaders,
LLMApi,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import {
useAccessStore,
useAppConfig,
@@ -80,6 +87,13 @@ const ClaudeMapper = {
const keys = ["claude-2, claude-instant-1"];

export class ClaudeApi implements LLMApi {
speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}

extractMessage(res: any) {
console.log("[Response] claude response: ", res);

9 changes: 9 additions & 0 deletions app/client/platforms/baidu.ts
@@ -14,6 +14,8 @@ import {
LLMApi,
LLMModel,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@@ -75,6 +77,13 @@ export class ErnieApi implements LLMApi {
return [baseUrl, path].join("/");
}

speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}

async chat(options: ChatOptions) {
const messages = options.messages.map((v) => ({
// "error_code": 336006, "error_msg": "the role of message with even index in the messages must be user or function",
9 changes: 9 additions & 0 deletions app/client/platforms/bytedance.ts
@@ -13,6 +13,8 @@ import {
LLMApi,
LLMModel,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@@ -77,6 +79,13 @@ export class DoubaoApi implements LLMApi {
return res.choices?.at(0)?.message?.content ?? "";
}

speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}

async chat(options: ChatOptions) {
const messages = options.messages.map((v) => ({
role: v.role,
16 changes: 15 additions & 1 deletion app/client/platforms/google.ts
@@ -1,5 +1,13 @@
import { ApiPath, Google, REQUEST_TIMEOUT_MS } from "@/app/constant";
import { ChatOptions, getHeaders, LLMApi, LLMModel, LLMUsage } from "../api";
import {
ChatOptions,
getHeaders,
LLMApi,
LLMModel,
LLMUsage,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";
import { getClientConfig } from "@/app/config/client";
import { DEFAULT_API_HOST } from "@/app/constant";
@@ -56,6 +64,12 @@ export class GeminiProApi implements LLMApi {
""
);
}
speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
async chat(options: ChatOptions): Promise<void> {
const apiClient = this;
let multimodal = false;
16 changes: 15 additions & 1 deletion app/client/platforms/iflytek.ts
@@ -7,7 +7,14 @@ import {
} from "@/app/constant";
import { useAccessStore, useAppConfig, useChatStore } from "@/app/store";

import { ChatOptions, getHeaders, LLMApi, LLMModel } from "../api";
import {
ChatOptions,
getHeaders,
LLMApi,
LLMModel,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
EventStreamContentType,
@@ -53,6 +60,13 @@ export class SparkApi implements LLMApi {
return res.choices?.at(0)?.message?.content ?? "";
}

speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}

async chat(options: ChatOptions) {
const messages: ChatOptions["messages"] = [];
for (const v of options.messages) {
9 changes: 9 additions & 0 deletions app/client/platforms/moonshot.ts
@@ -26,6 +26,8 @@ import {
LLMModel,
LLMUsage,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@@ -72,6 +74,13 @@ export class MoonshotApi implements LLMApi {
return res.choices?.at(0)?.message?.content ?? "";
}

speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}
Contributor comment on lines +77 to +82:
Implement the methods before merging.

The speech and transcription method signatures look good:

  • speech accepts SpeechOptions and returns Promise<ArrayBuffer>, which is suitable for speech synthesis.
  • transcription takes TranscriptionOptions and returns Promise<string>, which is appropriate for speech transcription.

However, please ensure that the methods are fully implemented before merging these changes to avoid unexpected errors in the application.


async chat(options: ChatOptions) {
const messages: ChatOptions["messages"] = [];
for (const v of options.messages) {
83 changes: 82 additions & 1 deletion app/client/platforms/openai.ts
@@ -33,6 +33,8 @@ import {
LLMModel,
LLMUsage,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@@ -84,7 +86,7 @@ export interface DalleRequestPayload {
export class ChatGPTApi implements LLMApi {
private disableListModels = true;

path(path: string): string {
path(path: string, model?: string): string {
const accessStore = useAccessStore.getState();

let baseUrl = "";
@@ -147,6 +149,85 @@ export class ChatGPTApi implements LLMApi {
return res.choices?.at(0)?.message?.content ?? res;
}

async speech(options: SpeechOptions): Promise<ArrayBuffer> {
const requestPayload = {
model: options.model,
input: options.input,
voice: options.voice,
response_format: options.response_format,
speed: options.speed,
};

console.log("[Request] openai speech payload: ", requestPayload);

const controller = new AbortController();
options.onController?.(controller);

try {
const speechPath = this.path(OpenaiPath.SpeechPath, options.model);
const speechPayload = {
method: "POST",
body: JSON.stringify(requestPayload),
signal: controller.signal,
headers: getHeaders(),
};

// make a fetch request
const requestTimeoutId = setTimeout(
() => controller.abort(),
REQUEST_TIMEOUT_MS,
);

const res = await fetch(speechPath, speechPayload);
clearTimeout(requestTimeoutId);
return await res.arrayBuffer();
} catch (e) {
console.log("[Request] failed to make a speech request", e);
throw e;
}
}
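
The method hands back raw, undecoded audio bytes. One way a caller could play the result in the browser, using the standard Web Audio API (a sketch, not code from this PR):

```ts
// Decode the ArrayBuffer returned by speech() and play it.
async function playSpeech(buffer: ArrayBuffer): Promise<void> {
  const ctx = new AudioContext();
  const decoded = await ctx.decodeAudioData(buffer);
  const source = ctx.createBufferSource();
  source.buffer = decoded;
  source.connect(ctx.destination);
  source.start();
}
```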

async transcription(options: TranscriptionOptions): Promise<string> {
const formData = new FormData();
formData.append("file", options.file, "audio.wav");
formData.append("model", options.model ?? "whisper-1");
if (options.language) formData.append("language", options.language);
if (options.prompt) formData.append("prompt", options.prompt);
if (options.response_format)
formData.append("response_format", options.response_format);
if (options.temperature)
formData.append("temperature", options.temperature.toString());

console.log("[Request] openai audio transcriptions payload: ", options);

const controller = new AbortController();
options.onController?.(controller);

try {
const path = this.path(OpenaiPath.TranscriptionPath, options.model);
const headers = getHeaders(true);
const payload = {
method: "POST",
body: formData,
signal: controller.signal,
headers: headers,
};

// make a fetch request
const requestTimeoutId = setTimeout(
() => controller.abort(),
REQUEST_TIMEOUT_MS,
);
const res = await fetch(path, payload);
clearTimeout(requestTimeoutId);
const json = await res.json();
return json.text;
} catch (e) {
console.log("[Request] failed to make a audio transcriptions request", e);
throw e;
}
}
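
transcription() takes the audio as a Blob, which in the browser typically comes from MediaRecorder. A minimal capture-and-transcribe sketch using the standard MediaRecorder API (the helper and its duration parameter are assumptions for illustration; note that MediaRecorder usually emits webm/ogg even though the request above names the part "audio.wav"):

```ts
// Record `ms` milliseconds of microphone audio, then transcribe it.
async function recordAndTranscribe(llm: LLMApi, ms = 5000): Promise<string> {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const recorder = new MediaRecorder(stream);
  const chunks: Blob[] = [];
  recorder.ondataavailable = (e) => chunks.push(e.data);

  const stopped = new Promise<void>((resolve) => {
    recorder.onstop = () => resolve();
  });
  recorder.start();
  setTimeout(() => recorder.stop(), ms);
  await stopped;

  stream.getTracks().forEach((t) => t.stop()); // release the microphone
  return llm.transcription({ file: new Blob(chunks) });
}
```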

async chat(options: ChatOptions) {
const modelConfig = {
...useAppConfig.getState().modelConfig,
9 changes: 9 additions & 0 deletions app/client/platforms/tencent.ts
@@ -8,6 +8,8 @@ import {
LLMApi,
LLMModel,
MultimodalContent,
SpeechOptions,
TranscriptionOptions,
} from "../api";
import Locale from "../../locales";
import {
@@ -89,6 +91,13 @@ export class HunyuanApi implements LLMApi {
return res.Choices?.at(0)?.Message?.Content ?? "";
}

speech(options: SpeechOptions): Promise<ArrayBuffer> {
throw new Error("Method not implemented.");
}
transcription(options: TranscriptionOptions): Promise<string> {
throw new Error("Method not implemented.");
}

async chat(options: ChatOptions) {
const visionModel = isVisionModel(options.config.model);
const messages = options.messages.map((v, index) => ({