Skip to content

Commit

Permalink
Added support for GPT4o and o200k_base models
Browse files Browse the repository at this point in the history
This commit should add support for all currently
available OpenAI models.
  • Loading branch information
bluescreen10 committed Sep 13, 2024
1 parent 7f65fe7 commit 53141e4
Show file tree
Hide file tree
Showing 6 changed files with 200,110 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Usage of tokenizer:
## Todo

- ✅ port code
- ✅ o200k_base encoding
- ✅ cl100k_base encoding
- ✅ r50k_base encoding
- ✅ p50k_base encoding
Expand Down
23 changes: 23 additions & 0 deletions codec/o200k_base.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package codec

import "github.com/dlclark/regexp2"

// NewO200kBase builds a Codec configured for the "o200k_base" encoding.
//
// The returned Codec carries the o200k_base vocabulary, the regular expression
// used to split input text into pieces, and the ids of the encoding's two
// special tokens. Every call compiles the split pattern again and returns a
// newly allocated *Codec. The pattern is a fixed literal, so the panic that
// regexp2.MustCompile raises on an invalid expression is not expected here.
func NewO200kBase() *Codec {
	// Run the vocabulary initializer through its Once guard (presumably a
	// sync.Once declared elsewhere in the package) before o200kBaseVocab is read.
	o200kBaseVocabOnce.Do(o200kBaseVocabInit)

	// Alternatives are tried left to right; one alternative per line.
	const splitPattern = `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|` + // optional lead char, any upper-ish run, at least one lower-ish char, optional contraction
		`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|` + // same shape, but at least one upper-ish char and any lower-ish run
		`\p{N}{1,3}|` + // one to three numeric characters
		` ?[^\s\p{L}\p{N}]+[\r\n/]*|` + // optional space, a run of non-space/non-letter/non-number chars, trailing CR/LF/slash
		`\s*[\r\n]+|` + // whitespace ending in one or more CR/LF
		`\s+(?!\S)|` + // whitespace not followed by a non-whitespace char (negative lookahead)
		`\s+` // any remaining whitespace

	// Ids assigned to the special tokens of this encoding.
	special := map[string]uint{
		"<|endoftext|>":   199999,
		"<|endofprompt|>": 200018,
	}

	splitter := regexp2.MustCompile(splitPattern, regexp2.None)

	return &Codec{
		name:          "o200k_base",
		vocabulary:    o200kBaseVocab,
		splitRegexp:   splitter,
		specialTokens: special,
	}
}
Loading

0 comments on commit 53141e4

Please sign in to comment.