-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTrainingTokenizerExample.swift
30 lines (23 loc) · 1.03 KB
/
TrainingTokenizerExample.swift
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import Tokenizers
/// Example executable that trains a BPE tokenizer on the WikiText-103 raw
/// splits, writes it to a single JSON file, loads it back, and encodes a
/// sample sentence.
@main
public struct TrainingTokenizerExample {
    /// Program entry point: train, save, reload, then encode.
    ///
    /// All paths are relative (`../data/...`), so they resolve against the
    /// process's current working directory.
    ///
    /// - Throws: Any error propagated by the `try` calls below (tokenizer or
    ///   trainer construction, training, saving, reloading, encoding).
    public static func main() throws {
        // Paths of the three raw corpus splits used for training.
        let corpusPaths = ["test", "train", "valid"].map { split in
            "../data/wikitext-103-raw/wiki.\(split).raw"
        }

        // A BPE model whose unknown token is "[UNK]".
        let bpeTokenizer = try Tokenizer(model: BPE(unkToken: "[UNK]"))

        // The trainer is given the special tokens up front.
        let bpeTrainer = try BPETrainer(specialTokens: [
            "[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]",
        ])

        // Install the whitespace pre-tokenizer before training.
        bpeTokenizer.preTokenizer = Whitespace()

        // Run training over the corpus files.
        try bpeTokenizer.train(files: corpusPaths, trainer: bpeTrainer)

        // Persist the trained tokenizer as one JSON file.
        try bpeTokenizer.save(to: "../data/tokenizer-wiki.json")

        // Load the tokenizer back from the file that was just written.
        print("Reload the trained tokenizer...")
        let reloadedTokenizer = try Tokenizer(contentsOfFile: "../data/tokenizer-wiki.json")

        // Encode a sample sentence with the reloaded tokenizer and show its tokens.
        let encoding = try reloadedTokenizer.encode("Hello, y'all! How are you 😁 ?")
        let tokens = encoding.tokens
        print(tokens)
        // Expected output:
        // ["Hello", ",", "y", "\'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    }
}