Skip to content

Commit

Permalink
fix: fix callbackQuery whitelist judgment error
Browse files Browse the repository at this point in the history
chore: Extract overall query logic

feat: add support for the 4o-audio-preview model (due to SDK limitations, only text output is currently possible); ffmpeg processing of audio is required, so this is only effective in the Node.js environment
  • Loading branch information
adolphnov committed Nov 27, 2024
1 parent 0bcfaca commit 166dab3
Show file tree
Hide file tree
Showing 24 changed files with 23,991 additions and 996 deletions.
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
FROM node:alpine as DEV
FROM node:alpine AS DEV

WORKDIR /app
COPY package.json vite.config.ts tsconfig.json ./
COPY src ./src
RUN npm install && npm run build:local

FROM node:alpine as PROD
FROM node:alpine AS PROD

WORKDIR /app
COPY --from=DEV /app/dist/index.js /app/dist/index.js
COPY --from=DEV /app/package.json /app/
RUN npm install --only=production --omit=dev && \
apk add --no-cache sqlite
npm cache clean --force && \
apk add --no-cache sqlite ffmpeg
EXPOSE 8787
CMD ["npm", "run", "start:dist"]
1 change: 1 addition & 0 deletions dist/buildinfo.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22,797 changes: 22,797 additions & 0 deletions dist/index.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions dist/timestamp

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 23 additions & 21 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "chatgpt-telegram-workers",
"type": "module",
"version": "2.0.3",
"version": "2.0.4",
"description": "The easiest and quickest way to deploy your own ChatGPT Telegram bot is to use a single file and simply copy and paste it. There is no need for any dependencies, local development environment configuration, domain names, or servers.",
"author": "tbxark <tbxark@outlook.com>",
"license": "MIT",
Expand Down Expand Up @@ -41,54 +41,56 @@
},
"dependencies": {
"@ai-sdk/anthropic": "^1.0.2",
"@ai-sdk/azure": "^1.0.5",
"@ai-sdk/azure": "^1.0.7",
"@ai-sdk/cohere": "^1.0.3",
"@ai-sdk/google": "^1.0.3",
"@ai-sdk/google-vertex": "^1.0.3",
"@ai-sdk/google": "^1.0.4",
"@ai-sdk/google-vertex": "^1.0.4",
"@ai-sdk/mistral": "^1.0.3",
"@ai-sdk/openai": "^1.0.4",
"@ai-sdk/openai": "^1.0.5",
"@ai-sdk/xai": "^1.0.3",
"ai": "^4.0.3",
"ai": "^4.0.6",
"base64-stream": "^1.0.0",
"cloudflare-worker-adapter": "^1.3.4",
"node-cron": "^3.0.3",
"ws": "^8.18.0"
"fluent-ffmpeg": "^2.1.3",
"node-cron": "^3.0.3"
},
"devDependencies": {
"@ai-sdk/anthropic": "^1.0.2",
"@ai-sdk/azure": "^1.0.5",
"@ai-sdk/azure": "^1.0.7",
"@ai-sdk/cohere": "^1.0.3",
"@ai-sdk/google": "^1.0.3",
"@ai-sdk/google-vertex": "^1.0.3",
"@ai-sdk/google": "^1.0.4",
"@ai-sdk/google-vertex": "^1.0.4",
"@ai-sdk/mistral": "^1.0.3",
"@ai-sdk/openai": "^1.0.4",
"@antfu/eslint-config": "^3.9.2",
"@ai-sdk/openai": "^1.0.5",
"@antfu/eslint-config": "^3.11.0",
"@cloudflare/workers-types": "^4.20241112.0",
"@google-cloud/vertexai": "^1.9.0",
"@navetacandra/ddg": "^0.0.6",
"@rollup/plugin-node-resolve": "^15.3.0",
"@types/base64-stream": "^1.0.5",
"@types/fluent-ffmpeg": "^2.1.27",
"@types/node": "^22.9.3",
"@types/node": "^22.10.0",
"@types/node-cron": "^3.0.11",
"@types/react": "^18.3.12",
"@types/react-dom": "^18.3.1",
"@types/ws": "^8.5.13",
"@vercel/node": "^3.2.26",
"ai": "^4.0.3",
"@vercel/node": "^3.2.27",
"ai": "^4.0.6",
"base64-stream": "^1.0.0",
"eslint": "^9.15.0",
"eslint-plugin-format": "^0.1.2",
"fluent-ffmpeg": "^2.1.3",
"gts": "^6.0.2",
"openai": "^4.73.0",
"openai": "^4.73.1",
"react-dom": "^18.3.1",
"rollup-plugin-cleanup": "^3.2.1",
"rollup-plugin-node-externals": "^7.1.3",
"telegram-bot-api-types": "^7.11.0",
"telegram-bot-api-types": "^8.0.0",
"tsx": "^4.19.2",
"typescript": "^5.7.2",
"vite": "^5.4.11",
"vite": "^6.0.1",
"vite-plugin-checker": "^0.8.0",
"vite-plugin-dts": "^4.3.0",
"wrangler": "^3.90.0",
"wrangler": "^3.91.0",
"ws": "^8.18.0"
}
}
15 changes: 8 additions & 7 deletions scripts/plugins/docker/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,18 @@ const packageJson = `
},
"dependencies": {
"@ai-sdk/anthropic": "^1.0.2",
"@ai-sdk/azure": "^1.0.5",
"@ai-sdk/azure": "^1.0.7",
"@ai-sdk/cohere": "^1.0.3",
"@ai-sdk/google": "^1.0.3",
"@ai-sdk/google-vertex": "^1.0.3",
"@ai-sdk/google": "^1.0.4",
"@ai-sdk/google-vertex": "^1.0.4",
"@ai-sdk/mistral": "^1.0.3",
"@ai-sdk/openai": "^1.0.4",
"@ai-sdk/openai": "^1.0.5",
"@ai-sdk/xai": "^1.0.3",
"ai": "^4.0.3",
"ai": "^4.0.6",
"base64-stream": "^1.0.0",
"cloudflare-worker-adapter": "^1.3.4",
"node-cron": "^3.0.3",
"ws": "^8.18.0"
"fluent-ffmpeg": "^2.1.3",
"node-cron": "^3.0.3"
},
"devDependencies": {}
}
Expand Down
2 changes: 2 additions & 0 deletions src/agent/model_middleware.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice,
// await warpModel(model, config, activeTools, (params.mode as any).toolChoice, chatModel);
recordModelLog(config, model, activeTools, (params.mode as any).toolChoice);
const result = await doGenerate();
log.debug(`doGenerate result: ${JSON.stringify(result)}`);
return result;
},

Expand Down Expand Up @@ -54,6 +55,7 @@ export function AIMiddleware({ config, tools, activeTools, onStream, toolChoice,

onChunk: (data: any) => {
const { chunk } = data;
log.debug(`chunk: ${JSON.stringify(chunk)}`);
if (chunk.type === 'tool-call' && !sendToolCall) {
onStream?.send(`${messageReferencer.join('')}...\n` + `tool call will start: ${chunk.toolName}`);
sendToolCall = true;
Expand Down
19 changes: 16 additions & 3 deletions src/agent/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,15 @@ export class OpenAI extends OpenAIBase implements ChatAgent {
};

readonly model = (ctx: AgentUserConfig, params?: LLMChatRequestParams): string => {
return Array.isArray(params?.content) ? ctx.OPENAI_VISION_MODEL : ctx.OPENAI_CHAT_MODEL;
const msgType = Array.isArray(params?.content) ? params.content.at(-1)?.type : 'text';
switch (msgType) {
case 'image':
return ctx.OPENAI_VISION_MODEL;
case 'file':
return 'gpt-4o-audio-preview';
default:
return ctx.OPENAI_CHAT_MODEL;
}
};

readonly request = async (params: LLMChatParams, context: AgentUserConfig, onStream: ChatStreamTextHandler | null): Promise<{ messages: ResponseMessage[]; content: string }> => {
Expand All @@ -35,6 +43,7 @@ export class OpenAI extends OpenAIBase implements ChatAgent {
baseURL: context.OPENAI_API_BASE,
apiKey: this.apikey(context),
compatibility: 'strict',
// fetch: this.fetch,
});

const languageModelV1 = provider.languageModel(originalModel, undefined);
Expand Down Expand Up @@ -73,8 +82,12 @@ export class OpenAI extends OpenAIBase implements ChatAgent {

readonly fetch = async (url: RequestInfo | URL, options?: RequestInit): Promise<Response> => {
const body = JSON.parse(options?.body as string);
if (body?.model.startsWith(OpenAI.transformModelPerfix)) {
body.model = body.model.slice(OpenAI.transformModelPerfix.length);
// if (body?.model.startsWith(OpenAI.transformModelPerfix)) {
// body.model = body.model.slice(OpenAI.transformModelPerfix.length);
// }
if (body.model === 'gpt-4o-audio-preview') {
body.modalities = ['text', 'audio'];
body.audio = { voice: 'alloy', format: 'opus' };
}
return fetch(url, {
...options,
Expand Down
2 changes: 1 addition & 1 deletion src/agent/request.ts
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ export async function requestChatCompletionsV2(params: { model: LanguageModelV1;
activeTools: params.activeTools,
onStepFinish: middleware.onStepFinish as (data: StepResult<any>) => void,
};
if (onStream !== null) {
if (onStream !== null /* && params.model.modelId !== 'gpt-4o-audio-preview' */) {
const stream = streamText({
...hander_params,
onChunk: middleware.onChunk as (data: any) => void,
Expand Down
12 changes: 6 additions & 6 deletions src/config/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -382,13 +382,13 @@ export class ExtraUserConfig {
RERANK_MODELS: string[] = ['gpt-4o-mini', 'gpt-4o-2024-05-13', 'gpt-4o-2024-08-06', 'chatgpt-4o-latest', 'o1-mini', 'o1-preview', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20241012', 'gemini-1.5-flash-002', 'gemini-1.5-pro-002', 'gemini-1.5-flash-latest', 'gemini-1.5-pro-latest', 'gemini-exp-1114', 'grok-beta', 'grok-vision-beta', 'claude-3-5-haiku-20241012'];
// Whether to enable intelligent model processing
ENABLE_INTELLIGENT_MODEL = false;
// text handle type, to asr or or just 'text' to chat with llm
TEXT_HANDLE_TYPE = 'text';
// Text output type, 'audio' or 'text'
// text handle type, to asr or 'text' to chat with llm, or 'chat' by using audio-preview (default: text)
TEXT_HANDLE_TYPE = 'chat';
// Text output type, 'audio' or 'text' (default: text)
TEXT_OUTPUT = 'text';
// Audio handle type, 'trans' or just 'audio' to chat with llm
AUDIO_HANDLE_TYPE = 'trans';
// Audio output type, 'audio' or 'text'
// Audio handle type, 'trans' or 'audio' to chat with llm, or 'chat' by using audio-preview (default: trans)
AUDIO_HANDLE_TYPE = 'chat';
// Audio output type, 'audio' or 'text' (default: text)
AUDIO_OUTPUT = 'text';
// Audio contains text
AUDIO_CONTAINS_TEXT = true;
Expand Down
86 changes: 0 additions & 86 deletions src/config/context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -140,89 +140,3 @@ export class WorkerContext implements WorkerContextBase {
return new WorkerContext(USER_CONFIG, SHARE_CONTEXT, MIDDLE_CONTEXT);
}
}

export class CallbackQueryContext {
data: string;
query_id: string;
from: Telegram.User;
USER_CONFIG: AgentUserConfig;
SHARE_CONTEXT: ShareContext;

constructor(callbackQuery: Telegram.CallbackQuery, workContext: WorkerContext) {
this.data = callbackQuery.data!;
this.query_id = callbackQuery.id;
this.from = callbackQuery.from!;
this.USER_CONFIG = workContext.USER_CONFIG;
this.SHARE_CONTEXT = workContext.SHARE_CONTEXT;
}
}

export class InlineQueryContext {
token: string;
query_id: string;
from_id: number;
chat_type: string | undefined;
query: string;

constructor(token: string, inlineQuery: Telegram.InlineQuery) {
this.token = token;
this.query_id = inlineQuery.id;
this.from_id = inlineQuery.from.id;
this.chat_type = inlineQuery.chat_type;
this.query = inlineQuery.query;
}
}

export class ChosenInlineContext {
token: string;
from_id: number;
query: string;
result_id: string;
inline_message_id: string;
constructor(token: string, choosenInlineQuery: Telegram.ChosenInlineResult) {
this.token = token;
this.from_id = choosenInlineQuery.from.id;
this.query = choosenInlineQuery.query;
this.result_id = choosenInlineQuery.result_id;
this.inline_message_id = choosenInlineQuery.inline_message_id || '';
}
}

export class ChosenInlineWorkerContext {
USER_CONFIG: AgentUserConfig;
botToken: string;
MIDDLE_CONTEXT: Record<string, any>;
SHARE_CONTEXT: Record<string, any>;
constructor(chosenInline: Telegram.ChosenInlineResult, token: string, USER_CONFIG: AgentUserConfig) {
this.USER_CONFIG = USER_CONFIG;
this.botToken = token;
// 模拟私聊消息
this.MIDDLE_CONTEXT = {
originalMessageInfo: { type: 'text' },
};
this.SHARE_CONTEXT = {
botName: 'AI',
telegraphAccessTokenKey: `telegraph_access_token:${chosenInline.from.id}`,
};
}

static async from(token: string, chosenInline: Telegram.ChosenInlineResult): Promise<ChosenInlineWorkerContext> {
const USER_CONFIG = { ...ENV.USER_CONFIG };
// Same as private chat
let userConfigKey = `user_config:${chosenInline.from.id}`;
const botId = Number.parseInt(token.split(':')[0]);
if (botId) {
userConfigKey += `:${botId}`;
}
try {
const userConfig: AgentUserConfig = JSON.parse(await ENV.DATABASE.get(userConfigKey));
ConfigMerger.merge(USER_CONFIG, ConfigMerger.trim(userConfig, ENV.LOCK_USER_CONFIG_KEYS) || {});
USER_CONFIG.ENABLE_SHOWINFO = ENV.INLINE_QUERY_SHOW_INFO;
// 过于频繁的请求不会被Telegram接受
ENV.TELEGRAM_MIN_STREAM_INTERVAL = ENV.INLINE_QUERY_SEND_INTERVAL;
} catch (e) {
console.warn(e);
}
return new ChosenInlineWorkerContext(chosenInline, token, USER_CONFIG);
}
}
2 changes: 1 addition & 1 deletion src/config/merger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export class ConfigMerger {
if (!sourceKeys.has(key)) {
continue;
}
if (exclude && exclude.includes(key)) {
if (exclude?.includes(key)) {
continue;
}
// 默认为字符串类型
Expand Down
18 changes: 15 additions & 3 deletions src/telegram/handler/chat.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/* eslint-disable unused-imports/no-unused-vars */
import type { FilePart, TextPart, ToolResultPart } from 'ai';
import type { ReadableStream as WebReadableStream } from 'node:stream/web';
import type * as Telegram from 'telegram-bot-api-types';
import type { ChatStreamTextHandler, HistoryModifier, ImageResult, LLMChatRequestParams } from '../../agent/types';
import type { WorkerContext } from '../../config/context';
Expand All @@ -15,6 +16,7 @@ import { clearLog, getLog, logSingleton } from '../../log/logDecortor';
import { log } from '../../log/logger';
import { sendToolResult } from '../../tools';
import { imageToBase64String } from '../../utils/image';
import { OggToMp3Converter } from '../../utils/others/audio';
import { createTelegramBotAPI } from '../api';
import { escape } from '../utils/md2tgmd';
import { MessageSender, sendAction, TelegraphSender } from '../utils/send';
Expand Down Expand Up @@ -139,10 +141,19 @@ export class ChatHandler implements MessageHandler<WorkerContext> {
});
}
} else if (type === 'audio' || type === 'voice') {
const isChat = context.USER_CONFIG.AUDIO_HANDLE_TYPE === 'chat';
let audioData = urls[0];
if (isChat) {
const response = await fetch(urls[0]);
if (!response.body) {
throw new Error('Failed to fetch audio data');
}
audioData = await new OggToMp3Converter(response.body as WebReadableStream, 'base64').convert() as string;
}
params.content.push({
type: 'file',
data: urls[0],
mimeType: 'audio/ogg',
data: audioData,
mimeType: 'audio/mpeg',
});
}
}
Expand Down Expand Up @@ -288,12 +299,13 @@ function workflowHandlers(type: string): WorkflowHandler {
case 'image:text':
case 'photo:text':
case 'text:chat':
case 'chat:text':
case 'chat:audio':
return handleText;
case 'text:image':
return handleTextToImage;
case 'audio:text':
case 'audio:audio':
case 'audio:chat':
case 'trans:text':
case 'trans:audio':
return handleAudio;
Expand Down
Loading

0 comments on commit 166dab3

Please sign in to comment.