Skip to content

Commit

Permalink
feat: use better limitTranscriptByteLength logic
Browse files Browse the repository at this point in the history
  • Loading branch information
JimmyLv committed Mar 8, 2023
1 parent 246d04c commit 673e421
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 102 deletions.
96 changes: 96 additions & 0 deletions lib/openai/getSmallSizeTranscripts.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright (c) 2022 Kazuki Nakayashiki.
// Modified work: Copyright (c) 2023 Qixiang Zhu.
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// via https://github.com/lxfater/BilibiliSummary/blob/3d1a67cbe8e96adba60672b778ce89644a43280d/src/prompt.ts#L62
export function limitTranscriptByteLength(str: string, byteLimit: number = LIMIT_COUNT) {
const utf8str = unescape(encodeURIComponent(str));
const byteLength = utf8str.length;
if (byteLength > byteLimit) {
const ratio = byteLimit / byteLength;
const newStr = str.substring(0, Math.floor(str.length * ratio));
return newStr;
}
return str;
}
function filterHalfRandomly<T>(arr: T[]): T[] {
const filteredArr: T[] = [];
const halfLength = Math.floor(arr.length / 2);
const indicesToFilter = new Set<number>();

// 随机生成要过滤掉的元素的下标
while (indicesToFilter.size < halfLength) {
const index = Math.floor(Math.random() * arr.length);
if (!indicesToFilter.has(index)) {
indicesToFilter.add(index);
}
}

// 过滤掉要过滤的元素
for (let i = 0; i < arr.length; i++) {
if (!indicesToFilter.has(i)) {
filteredArr.push(arr[i]);
}
}

return filteredArr;
}
function getByteLength(text: string) {
return unescape(encodeURIComponent(text)).length;
}

function itemInIt(textData: SubtitleItem[], text: string): boolean {
return textData.find(t => t.text === text) !== undefined;
}

type SubtitleItem = {
text: string;
index: number;
}

// Seems like 15,000 bytes is the limit for the prompt
const LIMIT_COUNT = 7000; // 1000 is a buffer
export function getSmallSizeTranscripts(newTextData: SubtitleItem[], oldTextData: SubtitleItem[], byteLimit: number = LIMIT_COUNT): string {
const text = newTextData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
const byteLength = getByteLength(text);

if (byteLength > byteLimit) {
const filtedData = filterHalfRandomly(newTextData);
return getSmallSizeTranscripts(filtedData, oldTextData, byteLimit);
}

let resultData = newTextData.slice();
let resultText = text;
let lastByteLength = byteLength;

for (let i = 0; i < oldTextData.length; i++) {
const obj = oldTextData[i];
if (itemInIt(newTextData, obj.text)) {
continue;
}

const nextTextByteLength = getByteLength(obj.text);
const isOverLimit = lastByteLength + nextTextByteLength > byteLimit;
if (isOverLimit) {
const overRate = (lastByteLength + nextTextByteLength - byteLimit) / nextTextByteLength;
const chunkedText = obj.text.substring(0, Math.floor(obj.text.length * overRate));
resultData.push({ text: chunkedText, index: obj.index });
} else {
resultData.push(obj);
}
resultText = resultData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
lastByteLength = getByteLength(resultText);
}

return resultText;
}


129 changes: 29 additions & 100 deletions lib/openai/prompt.ts
Original file line number Diff line number Diff line change
@@ -1,118 +1,47 @@
import { limitTranscriptByteLength } from "~/lib/openai/getSmallSizeTranscripts";

interface PromptConfig {
language?: string
sentenceCount?: string
shouldShowTimestamp?: boolean
language?: string;
sentenceCount?: string;
shouldShowTimestamp?: boolean;
}
const PROMPT_LANGUAGE_MAP = {
'English': "UK English",
"中文": "Simplified Chinese",
"繁體中文": "Traditional Chinese",
"日本語": "Japanese",
"Italiano": "Italian",
"Deutsch": "German",
"Español": "Spanish",
"Français": "French",
"Nederlands": "Dutch",
"한국어": "Korean",
"ភាសាខ្មែរ":"Khmer",
"हिंदी" : "Hindi"
}
English: "UK English",
中文: "Simplified Chinese",
繁體中文: "Traditional Chinese",
日本語: "Japanese",
Italiano: "Italian",
Deutsch: "German",
Español: "Spanish",
Français: "French",
Nederlands: "Dutch",
한국어: "Korean",
ភាសាខ្មែរ: "Khmer",
हिंदी: "Hindi",
};

export function getSystemPrompt(promptConfig: PromptConfig) {
// [gpt-3-youtube-summarizer/main.py at main · tfukaza/gpt-3-youtube-summarizer](https://github.com/tfukaza/gpt-3-youtube-summarizer/blob/main/main.py)
console.log('prompt config: ', promptConfig);
const { language = '中文', sentenceCount = '5', shouldShowTimestamp } = promptConfig
console.log("prompt config: ", promptConfig);
const {
language = "中文",
sentenceCount = "5",
shouldShowTimestamp,
} = promptConfig;
// @ts-ignore
const enLanguage = PROMPT_LANGUAGE_MAP[language]
const enLanguage = PROMPT_LANGUAGE_MAP[language];
// 我希望你是一名专业的视频内容编辑,帮我用${language}总结视频的内容精华。请你将视频字幕文本进行总结(字幕中可能有错别字,如果你发现了错别字请改正),然后以无序列表的方式返回,不要超过5条。记得不要重复句子,确保所有的句子都足够精简,清晰完整,祝你好运!
const betterPrompt = `I want you to act as an educational content creator. You will help students summarize the essence of the video in ${enLanguage}. Please summarize the video subtitles (there may be typos in the subtitles, please correct them) and return them in an unordered list format. Please do not exceed ${sentenceCount} items, and make sure not to repeat any sentences and all sentences are concise, clear, and complete. Good luck!`
const betterPrompt = `I want you to act as an educational content creator. You will help students summarize the essence of the video in ${enLanguage}. Please summarize the video subtitles (there may be typos in the subtitles, please correct them) and return them in an unordered list format. Please do not exceed ${sentenceCount} items, and make sure not to repeat any sentences and all sentences are concise, clear, and complete. Good luck!`;
// const timestamp = ' ' //`(类似 10:24)`;
// 我希望你是一名专业的视频内容编辑,帮我用${language}总结视频的内容精华。请先用一句简短的话总结视频梗概。然后再请你将视频字幕文本进行总结(字幕中可能有错别字,如果你发现了错别字请改正),在每句话的最前面加上时间戳${timestamp},每句话开头只需要一个开始时间。请你以无序列表的方式返回,请注意不要超过5条哦,确保所有的句子都足够精简,清晰完整,祝你好运!
const promptWithTimestamp = `I want you to act as an educational content creator. You will help students summarize the essence of the video in ${enLanguage}. Please start by summarizing the whole video in one short sentence. Then, please summarize the video subtitles in an unordered list format, you should add the start timestamp (e.g. 12.4 -) at the beginning of each sentence so that students can jump to the source of the video. Please make sure not to exceed ${sentenceCount} items and all sentences are concise, clear, and complete. Good luck!`;

return shouldShowTimestamp ? promptWithTimestamp : betterPrompt
return shouldShowTimestamp ? promptWithTimestamp : betterPrompt;
}
export function getUserSubtitlePrompt(title: string, transcript: any) {
return `标题: "${title
?.replace(/\n+/g, " ")
.trim()}"\n视频字幕: "${truncateTranscript(transcript)
.trim()}"\n视频字幕: "${limitTranscriptByteLength(transcript)
.replace(/\n+/g, " ")
.trim()}"`;
}

// Seems like 15,000 bytes is the limit for the prompt
const limit = 7000; // 1000 is a buffer

// todo: update to getSmallSizeTranscripts https://github.com/lxfater/BilibiliSummary/blob/3d1a67cbe8e96adba60672b778ce89644a43280d/src/prompt.ts#L62
export function getChunckedTranscripts(textData: { text: any; index: any; }[], textDataOriginal: any[]) {

// [Thought Process]
// (1) If text is longer than limit, then split it into chunks (even numbered chunks)
// (2) Repeat until it's under limit
// (3) Then, try to fill the remaining space with some text
// (eg. 15,000 => 7,500 is too much chuncked, so fill the rest with some text)

let result = "";
const text = textData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
const bytes = textToBinaryString(text).length;

if (bytes > limit) {
// Get only even numbered chunks from textArr
const evenTextData = textData.filter((t, i) => i % 2 === 0);
result = getChunckedTranscripts(evenTextData, textDataOriginal);
} else {
// Check if any array items can be added to result to make it under limit but really close to it
if (textDataOriginal.length !== textData.length) {
textDataOriginal.forEach((obj, i) => {

if (textData.some(t => t.text === obj.text)) { return; }

textData.push(obj);

const newText = textData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
const newBytes = textToBinaryString(newText).length;

if (newBytes < limit) {

const nextText = textDataOriginal[i + 1];
const nextTextBytes = textToBinaryString(nextText.text).length;

if (newBytes + nextTextBytes > limit) {
const overRate = ((newBytes + nextTextBytes) - limit) / nextTextBytes;
const chunkedText = nextText.text.substring(0, Math.floor(nextText.text.length * overRate));
textData.push({ text: chunkedText, index: nextText.index });
result = textData.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
} else {
result = newText;
}
}

})
} else {
result = text;
}
}

const originalText = textDataOriginal.sort((a, b) => a.index - b.index).map(t => t.text).join(" ");
return (result == "") ? originalText : result; // Just in case the result is empty

}

function truncateTranscript(str:string) {
const bytes = textToBinaryString(str).length;
if (bytes > limit) {
const ratio = limit / bytes;
const newStr = str.substring(0, str.length * ratio);
return newStr;
}
return str;
}

function textToBinaryString(str:string) {
let escstr = decodeURIComponent(encodeURIComponent(escape(str)));
let binstr = escstr.replace(/%([0-9A-F]{2})/gi, function (match, hex) {
let i = parseInt(hex, 16);
return String.fromCharCode(i);
});
return binstr;
}
}
5 changes: 3 additions & 2 deletions pages/api/sumup.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ import type { NextFetchEvent, NextRequest } from "next/server";
import { NextResponse } from "next/server";
import { fetchSubtitle } from "~/lib/fetchSubtitle";
import { ChatGPTAgent, fetchOpenAIResult } from "~/lib/openai/fetchOpenAIResult";
import { getChunckedTranscripts, getSystemPrompt, getUserSubtitlePrompt } from "~/lib/openai/prompt";
import { getSmallSizeTranscripts } from "~/lib/openai/getSmallSizeTranscripts";
import { getSystemPrompt, getUserSubtitlePrompt } from "~/lib/openai/prompt";
import { selectApiKeyAndActivatedLicenseKey } from "~/lib/openai/selectApiKeyAndActivatedLicenseKey";
import { SummarizeParams } from "~/lib/types";
import { isDev } from "~/utils/env";
Expand Down Expand Up @@ -37,7 +38,7 @@ export default async function handler(
return new Response("No subtitle in the video", { status: 501 });
}
const inputText = subtitlesArray
? getChunckedTranscripts(subtitlesArray, subtitlesArray)
? getSmallSizeTranscripts(subtitlesArray, subtitlesArray)
: descriptionText;
const systemPrompt = getSystemPrompt({
shouldShowTimestamp: subtitlesArray ? shouldShowTimestamp : false,
Expand Down

1 comment on commit 673e421

@vercel
Copy link

@vercel vercel bot commented on 673e421 Mar 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.