feat: use better limitTranscriptByteLength logic

JimmyLv · Mar 8, 2023 · 673e421 · 673e421 · vercel · Mar 8, 2023
1 parent 246d04c
commit 673e421
Show file tree

Hide file tree

Showing 3 changed files with 128 additions and 102 deletions.
diff --git a/lib/openai/getSmallSizeTranscripts.ts b/lib/openai/getSmallSizeTranscripts.ts
@@ -0,0 +1,96 @@
+// Copyright (c) 2022 Kazuki Nakayashiki.
+// Modified work: Copyright (c) 2023 Qixiang Zhu.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+
+// via https://github.com/lxfater/BilibiliSummary/blob/3d1a67cbe8e96adba60672b778ce89644a43280d/src/prompt.ts#L62
+/**
+ * Truncates `str` so that its UTF-8 encoding is at most `byteLimit` bytes long.
+ *
+ * The cut is always made on a code point boundary, so a surrogate pair
+ * (e.g. an emoji) is never split in half, and the returned prefix is the
+ * longest one that fits the limit.
+ *
+ * @param str - Text to shorten.
+ * @param byteLimit - Maximum UTF-8 byte length of the result; defaults to LIMIT_COUNT.
+ * @returns `str` unchanged when it already fits, otherwise the longest fitting prefix.
+ */
+export function limitTranscriptByteLength(str: string, byteLimit: number = LIMIT_COUNT): string {
+  let usedBytes = 0;
+  // UTF-16 offset just past the last code point that still fits.
+  let end = 0;
+
+  // Iterating a string with for...of yields whole code points, never half a surrogate pair.
+  for (const ch of str) {
+    const codePoint = ch.codePointAt(0) ?? 0;
+    // UTF-8 size of this code point (a lone surrogate is counted as 3 bytes).
+    const size = codePoint < 0x80 ? 1 : codePoint < 0x800 ? 2 : codePoint < 0x10000 ? 3 : 4;
+    if (usedBytes + size > byteLimit) {
+      return str.substring(0, end);
+    }
+    usedBytes += size;
+    end += ch.length;
+  }
+
+  return str;
+}
+/**
+ * Returns a copy of `arr` with floor(arr.length / 2) randomly chosen elements removed.
+ * The surviving elements keep their original relative order; `arr` itself is not modified.
+ */
+function filterHalfRandomly<T>(arr: T[]): T[] {
+  const dropCount = Math.floor(arr.length / 2);
+  const dropped = new Set<number>();
+
+  // Draw random positions until enough distinct ones are collected;
+  // Set.add ignores repeats, so a duplicate draw simply triggers another round.
+  while (dropped.size < dropCount) {
+    dropped.add(Math.floor(Math.random() * arr.length));
+  }
+
+  // Keep every element whose position was not drawn.
+  const survivors: T[] = [];
+  for (const [position, item] of arr.entries()) {
+    if (dropped.has(position)) {
+      continue;
+    }
+    survivors.push(item);
+  }
+
+  return survivors;
+}
+/**
+ * Returns the number of bytes `text` occupies when encoded as UTF-8.
+ *
+ * The size is computed per code point instead of going through
+ * `unescape(encodeURIComponent(...))`: `unescape` is deprecated and
+ * `encodeURIComponent` throws a URIError on a lone surrogate, which a
+ * truncated subtitle can easily contain. A lone surrogate is counted as 3 bytes.
+ */
+function getByteLength(text: string) {
+  let bytes = 0;
+  // for...of walks whole code points, so surrogate pairs are counted once as 4 bytes.
+  for (const ch of text) {
+    const codePoint = ch.codePointAt(0) ?? 0;
+    bytes += codePoint < 0x80 ? 1 : codePoint < 0x800 ? 2 : codePoint < 0x10000 ? 3 : 4;
+  }
+  return bytes;
+}
+
+/** Reports whether any entry of `textData` has exactly the given `text`. */
+function itemInIt(textData: SubtitleItem[], text: string): boolean {
+  return textData.some((entry) => entry.text === text);
+}
+
+/** A single subtitle line: its text and the `index` used to order lines. */
+type SubtitleItem = {
+  text: string;
+  index: number;
+};
+
+// Default byte budget used when a caller does not pass `byteLimit`.
+// NOTE(review): the note below mentions a 15,000-byte prompt limit and a
+// 1,000-byte buffer, which does not add up to 7000 — confirm the intended value.
+// Seems like 15,000 bytes is the limit for the prompt
+const LIMIT_COUNT = 7000; // 1000 is a buffer
+/**
+ * Builds one transcript string, ordered by `index`, that fits in `byteLimit` UTF-8 bytes.
+ *
+ * 1. While the joined text of `newTextData` is over the limit, half of the
+ *    items are dropped at random and the function recurses on the remainder.
+ * 2. Once it fits, items from `oldTextData` whose text is not already present
+ *    are added back, in the caller's order, until the budget is used up; the
+ *    first item that no longer fits is truncated to the remaining bytes.
+ *
+ * Neither input array is modified.
+ *
+ * @param newTextData - Items to start from (possibly already thinned out).
+ * @param oldTextData - Full item list used to refill leftover space.
+ * @param byteLimit - Maximum UTF-8 byte length of the result; defaults to LIMIT_COUNT.
+ * @returns The item texts joined with single spaces.
+ */
+export function getSmallSizeTranscripts(newTextData: SubtitleItem[], oldTextData: SubtitleItem[], byteLimit: number = LIMIT_COUNT): string {
+  const byIndex = (a: SubtitleItem, b: SubtitleItem) => a.index - b.index;
+  // Sort a copy: Array.prototype.sort works in place and would reorder the caller's array.
+  const sortedData = [...newTextData].sort(byIndex);
+  const text = sortedData.map(t => t.text).join(" ");
+  const byteLength = getByteLength(text);
+
+  if (byteLength > byteLimit) {
+    const filteredData = filterHalfRandomly(sortedData);
+    if (filteredData.length === sortedData.length) {
+      // Halving removes nothing once at most one item is left, so recursing
+      // again would never terminate; hard-truncate the text instead.
+      return limitTranscriptByteLength(text, byteLimit);
+    }
+    return getSmallSizeTranscripts(filteredData, oldTextData, byteLimit);
+  }
+
+  const resultData = sortedData.slice();
+  // Running byte size of the joined result, maintained incrementally so the
+  // list does not have to be re-sorted and re-joined for every candidate.
+  let usedBytes = byteLength;
+
+  for (const item of oldTextData) {
+    if (itemInIt(sortedData, item.text)) {
+      continue;
+    }
+
+    // Every piece after the first costs one extra byte for the " " separator.
+    const separatorBytes = resultData.length > 0 ? 1 : 0;
+    const remainingBytes = byteLimit - usedBytes - separatorBytes;
+    if (remainingBytes <= 0) {
+      break;
+    }
+
+    const itemBytes = getByteLength(item.text);
+    if (itemBytes <= remainingBytes) {
+      resultData.push(item);
+      usedBytes += separatorBytes + itemBytes;
+      continue;
+    }
+
+    // The item does not fit whole: keep only the part that fills the
+    // remaining budget, then stop because the limit has been reached.
+    const chunkedText = limitTranscriptByteLength(item.text, remainingBytes);
+    if (chunkedText.length > 0) {
+      resultData.push({ text: chunkedText, index: item.index });
+    }
+    break;
+  }
+
+  return resultData.sort(byIndex).map(t => t.text).join(" ");
+}
+
+
diff --git a/lib/openai/prompt.ts b/lib/openai/prompt.ts
@@ -1,118 +1,47 @@
+import { limitTranscriptByteLength } from "~/lib/openai/getSmallSizeTranscripts";
+
 interface PromptConfig {
-  language?: string
-  sentenceCount?: string
-  shouldShowTimestamp?: boolean
+  language?: string;
+  sentenceCount?: string;
+  shouldShowTimestamp?: boolean;
 }
 const PROMPT_LANGUAGE_MAP = {
-  'English': "UK English",
-  "中文": "Simplified Chinese",
-  "繁體中文": "Traditional Chinese",
-  "日本語": "Japanese",
-  "Italiano": "Italian",
-  "Deutsch": "German",
-  "Español": "Spanish",
-  "Français": "French",
-  "Nederlands": "Dutch",
-  "한국어": "Korean",
-  "ភាសាខ្មែរ":"Khmer",
-  "हिंदी" : "Hindi"
-}
+  English: "UK English",
+  中文: "Simplified Chinese",
+  繁體中文: "Traditional Chinese",
+  日本語: "Japanese",
+  Italiano: "Italian",
+  Deutsch: "German",
+  Español: "Spanish",
+  Français: "French",
+  Nederlands: "Dutch",
+  한국어: "Korean",
+  ភាសាខ្មែរ: "Khmer",
+  हिंदी: "Hindi",
+};
 
 export function getSystemPrompt(promptConfig: PromptConfig) {
   // [gpt-3-youtube-summarizer/main.py at main · tfukaza/gpt-3-youtube-summarizer](https://github.com/tfukaza/gpt-3-youtube-summarizer/blob/main/main.py)
-  console.log('prompt config: ', promptConfig);
-  const { language = '中文', sentenceCount = '5', shouldShowTimestamp } = promptConfig
+  console.log("prompt config: ", promptConfig);
+  const {
+    language = "中文",
+    sentenceCount = "5",
+    shouldShowTimestamp,
+  } = promptConfig;
   // @ts-ignore
-  const enLanguage = PROMPT_LANGUAGE_MAP[language]
+  const enLanguage = PROMPT_LANGUAGE_MAP[language];
   // 我希望你是一名专业的视频内容编辑，帮我用${language}总结视频的内容精华。请你将视频字幕文本进行总结（字幕中可能有错别字，如果你发现了错别字请改正），然后以无序列表的方式返回，不要超过5条。记得不要重复句子，确保所有的句子都足够精简，清晰完整，祝你好运！
-  const betterPrompt = `I want you to act as an educational content creator. You will help students summarize the essence of the video in ${enLanguage}. Please summarize the video subtitles (there may be typos in the subtitles, please correct them) and return them in an unordered list format. Please do not exceed ${sentenceCount} items, and make sure not to repeat any sentences and all sentences are concise, clear, and complete. Good luck!`
+  const betterPrompt = `I want you to act as an educational content creator. You will help students summarize the essence of the video in ${enLanguage}. Please summarize the video subtitles (there may be typos in the subtitles, please correct them) and return them in an unordered list format. Please do not exceed ${sentenceCount} items, and make sure not to repeat any sentences and all sentences are concise, clear, and complete. Good luck!`;
   // const timestamp = ' ' //`（类似 10:24）`;
   // 我希望你是一名专业的视频内容编辑，帮我用${language}总结视频的内容精华。请先用一句简短的话总结视频梗概。然后再请你将视频字幕文本进行总结（字幕中可能有错别字，如果你发现了错别字请改正），在每句话的最前面加上时间戳${timestamp}，每句话开头只需要一个开始时间。请你以无序列表的方式返回，请注意不要超过5条哦，确保所有的句子都足够精简，清晰完整，祝你好运！
   const promptWithTimestamp = `I want you to act as an educational content creator. You will help students summarize the essence of the video in ${enLanguage}. Please start by summarizing the whole video in one short sentence. Then, please summarize the video subtitles in an unordered list format, you should add the start timestamp (e.g. 12.4 -) at the beginning of each sentence so that students can jump to the source of the video. Please make sure not to exceed ${sentenceCount} items and all sentences are concise, clear, and complete. Good luck!`;
 
-  return shouldShowTimestamp ? promptWithTimestamp : betterPrompt
+  return shouldShowTimestamp ? promptWithTimestamp : betterPrompt;
 }
 export function getUserSubtitlePrompt(title: string, transcript: any) {
   return `标题: "${title
     ?.replace(/\n+/g, " ")
-    .trim()}"\n视频字幕: "${truncateTranscript(transcript)
+    .trim()}"\n视频字幕: "${limitTranscriptByteLength(transcript)
     .replace(/\n+/g, " ")
     .trim()}"`;
-  }
-
-  // Seems like 15,000 bytes is the limit for the prompt
-  const limit = 7000; // 1000 is a buffer
-
-// todo: update to getSmallSizeTranscripts https://github.com/lxfater/BilibiliSummary/blob/3d1a67cbe8e96adba60672b778ce89644a43280d/src/prompt.ts#L62
-  export function getChunckedTranscripts(textData: { text: any; index: any; }[], textDataOriginal: any[]) {
-
-    // [Thought Process]
-    // (1) If text is longer than limit, then split it into chunks (even numbered chunks)
-    // (2) Repeat until it's under limit
-    // (3) Then, try to fill the remaining space with some text
-    // (eg. 15,000 => 7,500 is too much chuncked, so fill the rest with some text)
-
-    let result = "";
-    const text = textData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
-    const bytes = textToBinaryString(text).length;
-
-    if (bytes > limit) {
-      // Get only even numbered chunks from textArr
-      const evenTextData = textData.filter((t, i) => i % 2 === 0);
-      result = getChunckedTranscripts(evenTextData, textDataOriginal);
-    } else {
-      // Check if any array items can be added to result to make it under limit but really close to it
-      if (textDataOriginal.length !== textData.length) {
-        textDataOriginal.forEach((obj, i) => {
-
-          if (textData.some(t => t.text === obj.text)) { return; }
-
-          textData.push(obj);
-
-          const newText = textData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
-          const newBytes = textToBinaryString(newText).length;
-
-          if (newBytes < limit) {
-
-            const nextText = textDataOriginal[i + 1];
-            const nextTextBytes = textToBinaryString(nextText.text).length;
-
-            if (newBytes + nextTextBytes > limit) {
-              const overRate = ((newBytes + nextTextBytes) - limit) / nextTextBytes;
-              const chunkedText = nextText.text.substring(0, Math.floor(nextText.text.length * overRate));
-              textData.push({ text: chunkedText, index: nextText.index });
-              result = textData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
-            } else {
-              result = newText;
-            }
-          }
-
-        })
-      } else {
-        result = text;
-      }
-    }
-
-    const originalText = textDataOriginal.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
-    return (result == "") ? originalText : result; // Just in case the result is empty
-
-  }
-
-  function truncateTranscript(str:string) {
-    const bytes = textToBinaryString(str).length;
-    if (bytes > limit) {
-      const ratio = limit / bytes;
-      const newStr = str.substring(0, str.length * ratio);
-      return newStr;
-    }
-    return str;
-  }
-
-  function textToBinaryString(str:string) {
-    let escstr = decodeURIComponent(encodeURIComponent(escape(str)));
-    let binstr = escstr.replace(/%([0-9A-F]{2})/gi, function (match, hex) {
-      let i = parseInt(hex, 16);
-      return String.fromCharCode(i);
-    });
-    return binstr;
-  }
+}
diff --git a/pages/api/sumup.ts b/pages/api/sumup.ts
@@ -3,7 +3,8 @@ import type { NextFetchEvent, NextRequest } from "next/server";
 import { NextResponse } from "next/server";
 import { fetchSubtitle } from "~/lib/fetchSubtitle";
 import { ChatGPTAgent, fetchOpenAIResult } from "~/lib/openai/fetchOpenAIResult";
-import { getChunckedTranscripts, getSystemPrompt, getUserSubtitlePrompt } from "~/lib/openai/prompt";
+import { getSmallSizeTranscripts } from "~/lib/openai/getSmallSizeTranscripts";
+import { getSystemPrompt, getUserSubtitlePrompt } from "~/lib/openai/prompt";
 import { selectApiKeyAndActivatedLicenseKey } from "~/lib/openai/selectApiKeyAndActivatedLicenseKey";
 import { SummarizeParams } from "~/lib/types";
 import { isDev } from "~/utils/env";
@@ -37,7 +38,7 @@ export default async function handler(
     return new Response("No subtitle in the video", { status: 501 });
   }
   const inputText = subtitlesArray
-    ? getChunckedTranscripts(subtitlesArray, subtitlesArray)
+    ? getSmallSizeTranscripts(subtitlesArray, subtitlesArray)
     : descriptionText;
   const systemPrompt = getSystemPrompt({
     shouldShowTimestamp: subtitlesArray ? shouldShowTimestamp : false,