feat: add a LRU BPE merge cache
defaults to 100k items, but can be disabled or increased for higher performance

fixes #68
niieani committed Dec 9, 2024
1 parent 15009fa commit 15d13b1
Showing 48 changed files with 482 additions and 61 deletions.
25 changes: 24 additions & 1 deletion README.md
@@ -235,9 +235,10 @@ const tokenLimit = 10
const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)
```

### `countTokens(text: string | Iterable<ChatMessage>): number`
### `countTokens(text: string | Iterable<ChatMessage>, encodeOptions?: EncodeOptions): number`

Counts the number of tokens in the input text or chat. Use this method when you need to determine the number of tokens without checking against a limit.
The optional `encodeOptions` parameter allows you to specify custom sets of allowed or disallowed special tokens.
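
A minimal sketch of passing the options object (the `allowedSpecial` property name in `EncodeOptions` is an assumption for illustration, not confirmed by this diff):

```ts
import { countTokens } from 'gpt-tokenizer'

// Hypothetical usage: allow a specific special token when counting.
// The `allowedSpecial` property name is assumed for illustration.
const count = countTokens('Some text with <|endoftext|> inside', {
  allowedSpecial: new Set(['<|endoftext|>']),
})
```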

Example:

@@ -370,6 +371,28 @@ const encoded = encode(inputText, undefined, disallowedSpecial)

In this example, an Error is thrown because the input text contains a disallowed special token.
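
If you need to handle that case gracefully, a minimal sketch of wrapping the same call in a try/catch (the input values below are illustrative):

```ts
import { encode } from 'gpt-tokenizer'

// Mirrors the call shown above; the input values are illustrative.
const inputText = 'Some text with <|endoftext|> inside'
const disallowedSpecial = new Set(['<|endoftext|>'])

try {
  const encoded = encode(inputText, undefined, disallowedSpecial)
  console.log(encoded)
} catch (error) {
  // Reached when the input contains a token listed in `disallowedSpecial`.
  console.error('Disallowed special token found:', error)
}
```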

## Performance Optimization

### LRU Merge Cache

The tokenizer uses an LRU (Least Recently Used) cache to improve encoding performance for similar strings. By default, it stores up to 100,000 merged token pairs. You can adjust this value to optimize for your specific use case:

- Increasing the cache size will make encoding similar strings faster but consume more memory
- Setting it to 0 will disable caching completely
- For applications processing many unique strings, a smaller cache might be more efficient

You can modify the cache size using the `setMergeCacheSize` function:

```ts
import { setMergeCacheSize } from 'gpt-tokenizer'

// Set to 5000 entries
setMergeCacheSize(5000)

// Disable caching completely
setMergeCacheSize(0)
```
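
For intuition, here is an illustrative sketch of how an LRU cache of merged pairs can work. The class below is hypothetical and is not the library's internal implementation:

```ts
// Illustrative only: a tiny LRU cache keyed by a pair of token ids.
// Map iteration order lets us evict the least recently used entry.
class LruMergeCache {
  private cache = new Map<string, number>()

  constructor(private maxSize = 100_000) {}

  get(left: number, right: number): number | undefined {
    const key = `${left},${right}`
    const value = this.cache.get(key)
    if (value === undefined) return undefined
    // Re-insert to mark the entry as most recently used.
    this.cache.delete(key)
    this.cache.set(key, value)
    return value
  }

  set(left: number, right: number, mergedToken: number): void {
    if (this.maxSize === 0) return // caching disabled
    const key = `${left},${right}`
    if (this.cache.has(key)) this.cache.delete(key)
    this.cache.set(key, mergedToken)
    if (this.cache.size > this.maxSize) {
      // Evict the oldest (least recently used) entry.
      const oldestKey = this.cache.keys().next().value
      if (oldestKey !== undefined) this.cache.delete(oldestKey)
    }
  }
}
```

Re-inserting on every hit keeps frequently merged pairs at the "young" end of the map, so eviction removes the pairs that have not been used recently.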

## Testing and Validation

`gpt-tokenizer` includes a set of test cases in the [TestPlans.txt](./data/TestPlans.txt) file to ensure its compatibility with OpenAI's Python `tiktoken` library. These test cases validate the functionality and behavior of `gpt-tokenizer`, providing a reliable reference for developers.
75 changes: 65 additions & 10 deletions benchmark/src/benchmarkRunner.ts
@@ -15,16 +15,16 @@ const calculateAverage = (numbers: number[]): number => {
return numbers.reduce((a, b) => a + b, 0) / numbers.length
}

// Define the number of executions for performance testing
const EXECUTIONS = 10000

// Define the number of iterations for averaging
const ITERATIONS = 3
const ITERATIONS = 1

// Define the execution multiplier for performance testing
const EXECUTIONS_MULTIPLIER = 1

// Function to run a single benchmark iteration in a child process
const runSingleBenchmark = (
tokenizerIndex: number,
executions: number,
executionsMultiplier: number,
): Promise<BenchmarkResult> => {
return new Promise((resolve, reject) => {
const workerPath = path.resolve(__dirname, 'benchmarkWorker.js')
@@ -36,7 +36,10 @@ const runSingleBenchmark = (
reject(new Error('Failed to spawn child process'))
return
}
const message: WorkerInput = { tokenizerIndex, executions }
const message: WorkerInput = {
tokenizerIndex,
executionsMultiplier,
}
child.send(message)
child.on('message', (msg: any) => {
// Changed to any to avoid TypeScript issues
@@ -117,12 +120,17 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
label: 'Encode Avg (ms)',
better: 'lower' as const,
precision: 4,
}, // Increased precision
},
decodeTimeAvg: {
label: 'Decode Avg (ms)',
better: 'lower' as const,
precision: 4,
}, // Increased precision
},
countTokensTimeAvg: {
label: 'Count Tokens Avg (ms)',
better: 'lower' as const,
precision: 4,
},
memoryIncrease: {
label: 'Memory Increase (MB)',
better: 'lower' as const,
@@ -146,6 +154,8 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
return r.datasetsAverage?.encodeTimeMs || 0
case 'decodeTimeAvg':
return r.datasetsAverage?.decodeTimeMs || 0
case 'countTokensTimeAvg':
return r.datasetsAverage?.countTimeMs || 0
case 'memoryIncrease':
return r.memoryChangeAfterRunMb
default:
@@ -166,6 +176,7 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
chalk.green('Init\nMem RSS'),
chalk.yellow('Encode\nAvg (ms)'),
chalk.yellow('Decode\nAvg (ms)'),
chalk.yellow('Count\nAvg (ms)'),
chalk.red('Memory\nIncrease'),
chalk.red('Mem\nLeak?'),
],
@@ -238,6 +249,13 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
changes.decodeTimeMs,
),
)
row.push(
applyHighlight(
res.datasetsAverage?.countTimeMs || 0,
'countTokensTimeAvg',
changes.countTimeMs,
),
)
row.push(
applyHighlight(
res.memoryChangeAfterRunMb,
@@ -277,7 +295,10 @@ const runBenchmarks = async (
for (let i = 0; i < ITERATIONS; i++) {
console.log(` ${chalk.yellow(`Iteration ${i + 1}/${ITERATIONS}`)}`)
try {
const result = await runSingleBenchmark(tokenizerIndex, EXECUTIONS)
const result = await runSingleBenchmark(
tokenizerIndex,
EXECUTIONS_MULTIPLIER,
)
tokenizerResults.push(result)
} catch (error) {
console.error(
@@ -317,6 +338,9 @@ const runBenchmarks = async (
const decodeTimes = tokenizerResults.map(
(r) => r.datasets[dataset].decode.averageTimeMs,
)
const countTimes = tokenizerResults.map(
(r) => r.datasets[dataset].countTokens.averageTimeMs,
)
const memoryChanges = tokenizerResults.map(
(r) => r.datasets[dataset].memoryChangeAfterExecutionsMb,
)
@@ -327,6 +351,9 @@ const runBenchmarks = async (
decode: {
averageTimeMs: calculateAverage(decodeTimes),
},
countTokens: {
averageTimeMs: calculateAverage(countTimes),
},
memoryChangeAfterExecutionsMb: calculateAverage(memoryChanges),
}
}
@@ -344,6 +371,13 @@ const runBenchmarks = async (
Object.values(r.datasets).map((d) => d.decode.averageTimeMs),
),
),
countTimeMs: calculateAverage(
tokenizerResults.flatMap((r) =>
Object.values(r.datasets).map(
(d) => d.countTokens?.averageTimeMs || 0,
),
),
),
},
})
}
@@ -406,7 +440,10 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
for (let i = 0; i < ITERATIONS; i++) {
console.log(` ${chalk.yellow(`Iteration ${i + 1}/${ITERATIONS}`)}`)
try {
const result = await runSingleBenchmark(tokenizerIndex, EXECUTIONS)
const result = await runSingleBenchmark(
tokenizerIndex,
EXECUTIONS_MULTIPLIER,
)
tokenizerResults.push(result)
} catch (error) {
console.error(
@@ -447,6 +484,13 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
Object.values(r.datasets).map((d) => d.decode.averageTimeMs),
),
),
countTimeMs: calculateAverage(
tokenizerResults.flatMap((r) =>
Object.values(r.datasets).map(
(d) => d.countTokens.averageTimeMs,
),
),
),
},
}
// Aggregate per-dataset results
@@ -458,6 +502,9 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
const decodeTimes = tokenizerResults.map(
(r) => r.datasets[dataset].decode.averageTimeMs,
)
const countTimes = tokenizerResults.map(
(r) => r.datasets[dataset].countTokens.averageTimeMs,
)
const memoryChanges = tokenizerResults.map(
(r) => r.datasets[dataset].memoryChangeAfterExecutionsMb,
)
@@ -468,6 +515,9 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
decode: {
averageTimeMs: calculateAverage(decodeTimes),
},
countTokens: {
averageTimeMs: calculateAverage(countTimes),
},
memoryChangeAfterExecutionsMb: calculateAverage(memoryChanges),
}
}
@@ -506,6 +556,11 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
(lastResult.datasetsAverage?.decodeTimeMs || 0)) /
(lastResult.datasetsAverage?.decodeTimeMs || 1)) *
100,
countTimeMs:
(((newAggregated.datasetsAverage?.countTimeMs || 0) -
(lastResult.datasetsAverage?.countTimeMs || 0)) /
(lastResult.datasetsAverage?.countTimeMs || 1)) *
100,
memoryChangeAfterRunMb:
((newAggregated.memoryChangeAfterRunMb -
lastResult.memoryChangeAfterRunMb) /
91 changes: 73 additions & 18 deletions benchmark/src/benchmarkWorker.ts
@@ -10,7 +10,7 @@ import { memoryUsage } from 'process'
import { tokenizers } from './tokenizers.js'

const runWorker = async (message: WorkerInput) => {
const { tokenizerIndex, executions } = message
const { tokenizerIndex, executionsMultiplier } = message
const tokenizer = tokenizers[tokenizerIndex]
const result: BenchmarkResult = {
packageName: tokenizer.packageName,
@@ -19,10 +19,10 @@ const runWorker = async (message: WorkerInput) => {
datasets: {},
memoryChangeAfterRunMb: 0,
memoryLeakWarning: false,
datasetsAverage: { encodeTimeMs: 0, decodeTimeMs: 0 },
datasetsAverage: { encodeTimeMs: 0, decodeTimeMs: 0, countTimeMs: 0 },
}
const encodeTimes: number[] = new Array(executions)
const decodeTimes: number[] = new Array(executions)
const testData = Object.entries(datasets)

try {
const initMemoryUsageBefore = memoryUsage()
const initStart = performance.now()
@@ -46,38 +46,66 @@
}

// Prepare datasets
const testData = Object.entries(datasets)
for (const [name, text] of testData) {
// Warm-up encode and decode
for (const [name, data] of testData) {
// Calculate actual execution counts
const encodeExecs = Math.max(
1,
Math.round(data.encodeExecutionsCount * executionsMultiplier),
)
const decodeExecs = Math.max(
1,
Math.round(data.decodeExecutionsCount * executionsMultiplier),
)
const countExecs = Math.max(
1,
Math.round(data.countTokensExecutionsCount * executionsMultiplier),
)

// Warm-up encode, decode and countTokens (using 5% of execution count)
let encodedTokens: number[] | Uint8Array = []
for (let i = 0; i < 50; i++) {
encodedTokens = tokenizerModule.encode(text)
const warmUpCount = Math.max(1, Math.round(encodeExecs * 0.05))
for (let i = 0; i < warmUpCount; i++) {
encodedTokens = tokenizerModule.encode(data.text)
tokenizerModule.decode(encodedTokens)
tokenizerModule.countTokens(data.text)
}

// Encode benchmark
for (let i = 0; i < executions; i++) {
const encodeTimes: number[] = new Array(encodeExecs)
for (let i = 0; i < encodeExecs; i++) {
const start = performance.now()
encodedTokens = tokenizerModule.encode(text)
encodedTokens = tokenizerModule.encode(data.text)
const end = performance.now()
encodeTimes[i] = end - start
}
const avgEncodeTime = encodeTimes.reduce((a, b) => a + b, 0) / executions
const avgEncodeTime = encodeTimes.reduce((a, b) => a + b, 0) / encodeExecs

// Decode benchmark
const decodeTimes: number[] = new Array(decodeExecs)
let decodedText: string = ''
for (let i = 0; i < executions; i++) {
for (let i = 0; i < decodeExecs; i++) {
const start = performance.now()
decodedText = tokenizerModule.decode(encodedTokens)
const end = performance.now()
decodeTimes[i] = end - start
}
const avgDecodeTime = decodeTimes.reduce((a, b) => a + b, 0) / executions
const avgDecodeTime = decodeTimes.reduce((a, b) => a + b, 0) / decodeExecs

// Count tokens benchmark
const countTokensTimes: number[] = new Array(countExecs)
for (let i = 0; i < countExecs; i++) {
const start = performance.now()
tokenizerModule.countTokens(data.text)
const end = performance.now()
countTokensTimes[i] = end - start
}
const avgCountTokensTime =
countTokensTimes.reduce((a, b) => a + b, 0) / countExecs

// Verify correctness
if (decodedText !== text) {
if (decodedText !== data.text) {
console.warn(
`Warning: Decoded text does not match original for dataset ${name}. \nExpected:\n${text}\nGot:\n${decodedText}`,
`Warning: Decoded text does not match original for dataset ${name}. \nExpected:\n${data.text}\nGot:\n${decodedText}`,
)
}

Expand All @@ -97,20 +125,44 @@ const runWorker = async (message: WorkerInput) => {
decode: {
averageTimeMs: parseFloat(avgDecodeTime.toFixed(4)),
},
countTokens: {
averageTimeMs: parseFloat(avgCountTokensTime.toFixed(4)),
},
memoryChangeAfterExecutionsMb: parseFloat(
(memoryUsed / 1024 / 1024).toFixed(2),
),
}
}

// Calculate dataset averages
const datasetCount = Object.keys(result.datasets).length
const encodeTimeSum = Object.values(result.datasets).reduce(
(sum, dataset) => sum + dataset.encode.averageTimeMs,
0,
)
const decodeTimeSum = Object.values(result.datasets).reduce(
(sum, dataset) => sum + dataset.decode.averageTimeMs,
0,
)
const countTimeSum = Object.values(result.datasets).reduce(
(sum, dataset) => sum + dataset.countTokens.averageTimeMs,
0,
)

result.datasetsAverage = {
encodeTimeMs: parseFloat((encodeTimeSum / datasetCount).toFixed(4)),
decodeTimeMs: parseFloat((decodeTimeSum / datasetCount).toFixed(4)),
countTimeMs: parseFloat((countTimeSum / datasetCount).toFixed(4)),
}

// Overall memory leak detection
const finalMemoryUsage = memoryUsage()
const totalMemoryIncrease =
finalMemoryUsage.heapUsed - initMemoryUsageAfter.heapUsed
result.memoryChangeAfterRunMb = parseFloat(
(totalMemoryIncrease / 1024 / 1024).toFixed(2),
)
result.memoryLeakWarning = totalMemoryIncrease > 1 * 1024 * 1024 // 1 MB threshold
result.memoryLeakWarning = totalMemoryIncrease > 10 * 1024 * 1024 // 10 MB threshold

// Send the result back to the parent process
const output: WorkerOutput = {
Expand All @@ -130,7 +182,10 @@ const runWorker = async (message: WorkerInput) => {
}

if (process.argv.length > 2) {
runWorker({ executions: 100000, tokenizerIndex: tokenizers.length - 1 })
runWorker({
executionsMultiplier: 1,
tokenizerIndex: tokenizers.length - 1,
})
} else {
process.on('message', runWorker)
}