Skip to content

Commit

Permalink
feat: crop text util
Browse files Browse the repository at this point in the history
  • Loading branch information
josStorer committed Mar 12, 2023
1 parent e0dcb7f commit 5dbea17
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions src/utils/crop-text.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { maxResponseTokenLength } from '../config.mjs'
import { encode } from '@nem035/gpt-3-encoder'

// TODO add model support
/**
 * Shorten `text` so that it fits within roughly `maxLength` units while
 * keeping its beginning, its ending, and an evenly thinned-out middle.
 *
 * The text is split into segments on common Latin/CJK punctuation and
 * newlines. Segments are then re-joined with ',' in three parts:
 *   1. head   - leading segments, up to `startLength` units;
 *   2. middle - following segments, with segments dropped at a regular
 *               interval (`cropStep`) so the remainder approaches the budget;
 *   3. tail   - trailing segments, up to `endLength` units.
 *
 * Every emitted segment is charged its own length plus one unit for the ','
 * separator. The original delimiters are not preserved, and the result ends
 * with a trailing ','.
 *
 * @param {string} text - Text to crop.
 * @param {number} [maxLength] - Overall budget; `text` is returned untouched
 *   when it already fits.
 * @param {number} [startLength=400] - Budget reserved for the head.
 * @param {number} [endLength=300] - Budget reserved for the tail.
 * @param {boolean} [tiktoken=true] - When true, lengths are token counts from
 *   `encode`; when false, lengths are `String#length` values.
 * @returns {string} The original text, or the cropped ','-joined text.
 */
export function cropText(
  text,
  maxLength = 3900 - maxResponseTokenLength,
  startLength = 400,
  endLength = 300,
  tiktoken = true,
) {
  const measure = (s) => (tiktoken ? encode(s).length : s.length)
  if (measure(text) <= maxLength) return text

  // Empty segments (consecutive delimiters, blank lines) carry no content;
  // dropping them avoids emitting stray commas and wasting budget/skip steps.
  const splits = text
    .split(/[,,.。??!!;;\n]/)
    .map((s) => s.trim())
    .filter((s) => s.length > 0)
  const splitsLength = splits.map(measure)
  const length = splitsLength.reduce((sum, segmentLength) => sum + segmentLength, 0)

  // How many segments to skip (on average) between two kept middle segments.
  const cropLength = length - startLength - endLength
  const cropTargetLength = maxLength - startLength - endLength
  const cropPercentage = cropTargetLength / cropLength
  const cropStep = Math.max(0, 1 / cropPercentage - 1)

  // Collect the tail first so the forward pass can stop before it; otherwise
  // segments near the end could be emitted twice. The separator is included
  // in the check so the tail never exceeds `endLength`.
  let endPart = ''
  let endPartLength = 0
  let endStartIndex = splits.length
  while (
    endStartIndex > 0 &&
    endPartLength + splitsLength[endStartIndex - 1] + 1 <= endLength
  ) {
    endStartIndex--
    endPart = splits[endStartIndex] + ',' + endPart
    endPartLength += splitsLength[endStartIndex] + 1
  }

  let croppedText = ''
  let currentLength = 0
  let currentStep = 0

  for (let currentIndex = 0; currentIndex < endStartIndex; currentIndex++) {
    const cost = splitsLength[currentIndex] + 1
    if (currentLength + cost <= startLength) {
      // Head: always kept.
      croppedText += splits[currentIndex] + ','
      currentLength += cost
    } else if (currentLength + cost + endLength <= maxLength) {
      // Middle: skip until the accumulated step reaches cropStep, then keep
      // one segment and carry the fractional remainder forward.
      if (currentStep < cropStep) {
        currentStep++
      } else {
        croppedText += splits[currentIndex] + ','
        currentLength += cost
        currentStep = currentStep - cropStep
      }
    } else {
      // The next segment would eat into the space reserved for the tail.
      break
    }
  }

  currentLength += endPartLength
  croppedText += endPart

  // Diagnostic output: compares the measured result with the estimated one.
  console.log(
    `maxLength: ${maxLength}\n` +
      `croppedTextLength: ${tiktoken ? encode(croppedText).length : croppedText.length}\n` +
      `desiredLength: ${currentLength}\n` +
      `content: ${croppedText}`,
  )
  return croppedText
}

0 comments on commit 5dbea17

Please sign in to comment.