feat: added o200k_base to encodings and configured its specialTokens
mbukeRepo committed May 16, 2024
1 parent 13157df commit 2a9da2b
Showing 8 changed files with 86 additions and 4 deletions.
4 changes: 3 additions & 1 deletion package.json
@@ -12,6 +12,7 @@
"GPT-3",
"GPT-3.5",
"GPT-4",
"GPT-4o",
"NLP",
"Natural Language Processing",
"Text Generation",
@@ -77,11 +78,12 @@
"build": "yarn build:cjs && yarn build:esm && yarn build:umd",
"build:cjs": "yarn rrun tsc --outDir cjs --module commonjs --target es2022 --project tsconfig-cjs.json",
"build:esm": "yarn rrun tsc --outDir esm --module esnext --target es2022 && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json",
"build:umd": "yarn build:umd:cl100k_base && yarn build:umd:p50k_base && yarn build:umd:p50k_edit && yarn build:umd:r50k_base",
"build:umd": "yarn build:umd:cl100k_base && yarn build:umd:p50k_base && yarn build:umd:p50k_edit && yarn build:umd:r50k_base && yarn build:umd:o200k_base",
"build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_cl100k_base' --env 'filename=cl100k_base.js'",
"build:umd:p50k_base": "beemo webpack --entry='./src/encoding/p50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_base' --env 'filename=p50k_base.js'",
"build:umd:p50k_edit": "beemo webpack --entry='./src/encoding/p50k_edit.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_edit' --env 'filename=p50k_edit.js'",
"build:umd:r50k_base": "beemo webpack --entry='./src/encoding/r50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_r50k_base' --env 'filename=r50k_base.js'",
"build:umd:o200k_base": "beemo webpack --entry='./src/encoding/o200k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_o200k_base' --env 'filename=o200k_base.js'",
"clean": "git clean -dfX --exclude=node_modules src && beemo typescript:sync-project-refs",
"format": "yarn rrun prettier --write \"./{src,tests,.config}/**/!(*.d).{.js,jsx,ts,tsx,json,md}\"",
"postinstallDev": "yarn prepare",
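The new build:umd:o200k_base script mirrors the existing UMD builds: it bundles src/encoding/o200k_base.ts into dist/o200k_base.js under the global name GPTTokenizer_o200k_base. A rough browser-usage sketch in TypeScript — the CDN URL is illustrative, and the exposed shape is assumed to match the other UMD bundles:

// Assumes dist/o200k_base.js was loaded first, e.g. via
// <script src="https://unpkg.com/gpt-tokenizer/dist/o200k_base.js"></script> (URL illustrative).
// The global name comes from the --env 'name=GPTTokenizer_o200k_base' flag above.
declare const GPTTokenizer_o200k_base: {
  encode(text: string): number[]
  decode(tokens: number[]): string
}

const tokens = GPTTokenizer_o200k_base.encode('hello world')
console.log(tokens, GPTTokenizer_o200k_base.decode(tokens))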
18 changes: 17 additions & 1 deletion src/GptEncoding.test.ts
@@ -26,6 +26,21 @@ const sharedResults = {
}

const results = {
o200k_base: {
space: [220],
tab: [197],
'This is some text': [2_028, 374, 1_063, 1_495],
indivisible: [485, 344, 23_936],
'hello 👋 world 🌍': [15_339, 62_904, 233, 1_917, 11_410, 234, 235],
decodedHelloWorldTokens: ['hello', ' ', '👋', ' world', ' ', '🌍'],
'toString constructor hasOwnProperty valueOf': [
6_712, 4_797, 706, 19_964, 907, 2_173,
],
'hello, I am a text, and I have commas. a,b,c': [
15_339, 11, 358, 1_097, 264, 1_495, 11, 323, 358, 617, 77_702, 13, 264,
8_568, 10_317,
],
},
cl100k_base: {
space: [220],
tab: [197],
@@ -111,7 +126,7 @@ describe.each(encodingNames)('%s', (encodingName: EncodingName) => {
it('decode token-by-token via generator', () => {
const str = 'hello 👋 world 🌍'
const generator = decodeGenerator(result[str])
result.decodedHelloWorldTokens.forEach((token) => {
result.decodedHelloWorldTokens.forEach((token: string) => {
expect(generator.next().value).toBe(token)
})
})
@@ -243,6 +258,7 @@ function loadTestPlans() {
p50k_base: [],
p50k_edit: [],
r50k_base: [],
o200k_base: [],
}
testPlanData.split('\n\n').forEach((testPlan) => {
const [encodingNameLine, sampleLine, encodedLine] = testPlan.split('\n')
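The loader above splits the fixture data into blank-line-separated blocks and destructures each block into three lines (encoding name, sample, encoded tokens), so registering o200k_base: [] implies the fixture file gains matching blocks. A purely hypothetical block, only to illustrate the shape that destructuring expects — the sample text and token ids are placeholders, not real o200k_base output:

const hypotheticalTestPlan = [
  'o200k_base', // encodingNameLine
  'hello world', // sampleLine
  '[12345, 67890]', // encodedLine — placeholder ids, not actual o200k_base tokens
].join('\n')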
32 changes: 32 additions & 0 deletions src/encoding/o200k_base.ts
@@ -0,0 +1,32 @@
/* eslint-disable import/extensions */
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'
import encoder from '../encodings/o200k_base.js'
import { GptEncoding } from '../GptEncoding.js'

export * from '../specialTokens.js'

const api = GptEncoding.getEncodingApi('o200k_base', () =>
convertTokenBytePairEncodingFromTuples(encoder),
)
const {
decode,
decodeAsyncGenerator,
decodeGenerator,
encode,
encodeGenerator,
isWithinTokenLimit,
encodeChat,
encodeChatGenerator,
} = api
export {
decode,
decodeAsyncGenerator,
decodeGenerator,
encode,
encodeChat,
encodeChatGenerator,
encodeGenerator,
isWithinTokenLimit,
}
// eslint-disable-next-line import/no-default-export
export default api
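The new entrypoint mirrors the other per-encoding modules, re-exporting the bound helpers from a GptEncoding instance. A minimal usage sketch; the gpt-tokenizer/encoding/o200k_base subpath is assumed to follow the package's existing per-encoding export convention:

import {
  decode,
  encode,
  isWithinTokenLimit,
} from 'gpt-tokenizer/encoding/o200k_base'

const text = 'hello 👋 world 🌍'
const tokens = encode(text)
console.log(tokens)
console.log(decode(tokens) === text) // should round-trip back to the original string
// isWithinTokenLimit is expected to return the token count while under the limit, false otherwise
console.log(isWithinTokenLimit(text, 100))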
2 changes: 1 addition & 1 deletion src/mapping.ts
@@ -94,7 +94,7 @@ const internalChatModelParams = {
'gpt-4o': {
messageSeparator: '',
roleSeparator: ImSep,
}
},
}

export const chatModelParams: Partial<Record<ModelName, ChatParameters>> =
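For context, the gpt-4o chat parameters shown here (only the trailing comma changes) are what the chat helpers exported from the new o200k_base module consume. A hedged sketch of exercising them — the message shape and the encodeChat(messages, model) signature are assumed to match the package's existing chat API:

import { encodeChat } from 'gpt-tokenizer/encoding/o200k_base'

const chat = [
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'user', content: 'Hello!' },
] as const

// 'gpt-4o' selects the messageSeparator/roleSeparator configured above.
console.log(encodeChat(chat, 'gpt-4o').length)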
2 changes: 1 addition & 1 deletion src/model/gpt-4o.ts
@@ -29,4 +29,4 @@ export {
isWithinTokenLimit,
}
// eslint-disable-next-line import/no-default-export
export default api
export default api
23 changes: 23 additions & 0 deletions src/modelParams.ts
@@ -86,6 +86,26 @@ function Cl100KBase(mergeableBytePairRanks: EncoderMap): EncodingParams {
}
}

function O200KBase(mergeableBytePairRanks: EncoderMap): EncodingParams {
const specialTokenMapping = new Map<string, number>([
[EndOfText, 199_999],
[FimPrefix, 200_000],
[FimMiddle, 200_001],
[FimSuffix, 200_002],
[ImStart, 200_003],
[ImEnd, 200_004],
[ImSep, 200_005],
[EndOfPrompt, 200_006],
])

return {
tokenSplitRegex:
/(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu,
mergeableBytePairRanks,
specialTokenMapping,
}
}

export type GetMergeableRanksFn = (encodingName: EncodingName) => EncoderMap
export type GetMergeableRanksAsyncFn = (
encodingName: EncodingName,
@@ -109,6 +129,9 @@ export function getEncodingParams(
case 'cl100k_base':
return Cl100KBase(mergeableBytePairRanks)

case 'o200k_base':
return O200KBase(mergeableBytePairRanks)

default:
throw new Error(`Unknown encoding name: ${encodingName}`)
}
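A rough sketch of resolving the new parameters end-to-end, assuming getEncodingParams takes the encoding name plus a rank resolver (its full signature is truncated above) and that EndOfText is the usual '<|endoftext|>' marker:

import { getEncodingParams } from './modelParams.js'
import { resolveEncoding } from './resolveEncoding.js'

const params = getEncodingParams('o200k_base', resolveEncoding)
// Per the mapping added above:
console.log(params.specialTokenMapping.get('<|endoftext|>')) // 199999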
3 changes: 3 additions & 0 deletions src/resolveEncoding.ts
@@ -4,6 +4,7 @@ import type { EncoderMap } from './EncoderMap.js'
import cl100k from './encodings/cl100k_base.js'
import p50k from './encodings/p50k_base.js'
import r50k from './encodings/r50k_base.js'
import o200k from './encodings/o200k_base.js'
import type { EncodingName } from './mapping.js'

export const resolveEncoding = (encoding: EncodingName): EncoderMap => {
@@ -15,6 +16,8 @@ export const resolveEncoding = (encoding: EncodingName): EncoderMap => {
return convertTokenBytePairEncodingFromTuples(p50k)
case 'cl100k_base':
return convertTokenBytePairEncodingFromTuples(cl100k)
case 'o200k_base':
return convertTokenBytePairEncodingFromTuples(o200k)
default: {
throw new Error(`Unknown encoding name: ${encoding}`)
}
6 changes: 6 additions & 0 deletions src/resolveEncodingAsync.ts
@@ -26,6 +26,12 @@ export const resolveEncodingAsync = async (
({ default: encodingTuples }) => encodingTuples,
),
)
case 'o200k_base':
return convertTokenBytePairEncodingFromTuples(
await import('./encodings/o200k_base.js').then(
({ default: encodingTuples }) => encodingTuples,
),
)
default: {
throw new Error(`Unknown encoding name: ${encoding}`)
}
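A short sketch of lazily loading the new ranks through the async resolver, mirroring the existing branches; the dynamic import keeps the large rank table out of the initial bundle. The EncoderMap return shape is taken from the sync resolver's signature:

import { resolveEncodingAsync } from './resolveEncodingAsync.js'

const o200kRanks = await resolveEncodingAsync('o200k_base')
console.log(o200kRanks) // an EncoderMap built from the o200k_base byte-pair tuples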
