-
Notifications
You must be signed in to change notification settings - Fork 5
/
Class.Tokenizer.php
138 lines (110 loc) · 4.2 KB
/
Class.Tokenizer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
<?php
/**
 * Word-level tokenizer backed by a JSON vocabulary file.
 *
 * The vocabulary is a map of word => token id. The class converts
 * space-separated text to token ids (encode), token ids back to text
 * (decode), and builds a softmax-normalised token-count vector
 * (GetEmbedding).
 */
class Tokenizer{
    /** @var array Map of word => token id, loaded from the vocabulary file. */
    private $vocabulary = array();

    /** @var array|null Map of token id => word, built on the first decode() call. */
    private $reverseVocabulary = null;

    /**
     * Load the vocabulary from a JSON file.
     *
     * NOTE: Run GenerateWordTokensVocabulary.php FIRST to generate the vocab.json file.
     *
     * @param string $vocabPath Path to the vocabulary JSON file. The default is
     *                          resolved against the current working directory.
     * @throws RuntimeException If the file cannot be read or does not decode to an array.
     */
    public function __construct($vocabPath = "vocab.json"){
        if(!is_file($vocabPath) || !is_readable($vocabPath)){
            throw new RuntimeException("Vocabulary file not found or not readable: " . $vocabPath);
        }
        $json = file_get_contents($vocabPath);
        if($json === false){
            throw new RuntimeException("Unable to read vocabulary file: " . $vocabPath);
        }
        $vocabulary = json_decode($json, true);
        if(!is_array($vocabulary)){
            throw new RuntimeException(
                "Vocabulary file did not decode to an array: " . $vocabPath
                . " (" . json_last_error_msg() . ")"
            );
        }
        $this->vocabulary = $vocabulary;
    }

    /**
     * Encode a string into an array of token ids.
     *
     * The string is split on single spaces. Each word is looked up as-is
     * first and then lowercased; a word found neither way is encoded as -1.
     *
     * @param string $string Text to encode.
     * @return array List of token ids, one per space-separated word.
     */
    public function encode($string){
        $tokens = array();
        foreach(explode(" ", $string) as $word){
            // Exact-case match wins over the lowercase fallback
            if(array_key_exists($word, $this->vocabulary)){
                $tokens[] = $this->vocabulary[$word];
                continue;
            }
            $lower = strtolower($word);
            // -1 marks a word that is not in the vocabulary
            $tokens[] = array_key_exists($lower, $this->vocabulary)
                ? $this->vocabulary[$lower]
                : -1;
        }
        return $tokens;
    }

    /**
     * Decode an array of token ids into a space-separated string.
     *
     * A token with no vocabulary entry (such as -1) decodes to an empty
     * string, so its position is kept and shows up as a doubled space.
     *
     * @param array $array List of token ids.
     * @return string The decoded words joined by single spaces.
     */
    public function decode($array){
        $lookup = $this->getReverseVocabulary();
        $words = array();
        foreach($array as $token){
            $known = (is_int($token) || is_string($token)) && array_key_exists($token, $lookup);
            $words[] = $known ? $lookup[$token] : "";
        }
        return implode(" ", $words);
    }

    /**
     * Build (once) and return the token id => word map used by decode().
     *
     * When several words share a token id the first word in the vocabulary
     * is kept, which is the same word array_search() would return.
     *
     * @return array Map of token id => word.
     */
    private function getReverseVocabulary(){
        if($this->reverseVocabulary === null){
            $reverse = array();
            foreach($this->vocabulary as $word => $token){
                // Only ints and strings are usable as array keys
                if((is_int($token) || is_string($token)) && !array_key_exists($token, $reverse)){
                    $reverse[$token] = $word;
                }
            }
            $this->reverseVocabulary = $reverse;
        }
        return $this->reverseVocabulary;
    }

    /**
     * Compute the softmax of an array of numbers.
     *
     * The maximum is subtracted from every value before exponentiation so
     * that exp() cannot overflow. Array keys are preserved.
     *
     * @param array $array Numeric values.
     * @return array Same keys, values in (0, 1] summing to 1; empty for empty input.
     */
    public function SoftMax($array){
        if(count($array) === 0){
            return array(); // max() is not defined for an empty array
        }
        $max = max($array);
        $exponentials = array();
        $sum = 0;
        foreach($array as $key => $value){
            $exponentials[$key] = exp($value - $max);
            $sum += $exponentials[$key];
        }
        foreach($exponentials as $key => $value){
            $exponentials[$key] = $value / $sum;
        }
        return $exponentials;
    }

    /**
     * Build a softmax-normalised token-count vector for a list of tokens.
     *
     * The vector has one entry per vocabulary token id. Each entry counts
     * how often that id occurs in $tokens; ids that are not in the
     * vocabulary (such as -1) are ignored. The counts are then passed
     * through SoftMax().
     *
     * @param array $tokens List of token ids, e.g. the output of encode().
     * @return array Map of token id => softmax weight.
     */
    public function GetEmbedding($tokens){
        // Embedding keys are the vocabulary's token ids, all starting at zero
        $embedding = array_fill_keys(array_values($this->vocabulary), 0);
        foreach($tokens as $token){
            if((is_int($token) || is_string($token)) && array_key_exists($token, $embedding)){
                $embedding[$token] += 1;
            }
        }
        return $this->SoftMax($embedding);
    }
}
/*
// Example usage:
// Include the class
require_once("Class.Tokenizer.php"); // If not in this file include the class
// Create a new tokenizer object
$tokenizer = new Tokenizer();
$string = "Hello world foobar hello tree three";
// Encode the string
$encoded = $tokenizer->encode($string); // words to vocabulary token keys
// Print the encoded string
print_r($encoded);
// Array
// (
// [0] => 50195 // hello
// [1] => 98440 // world
// [2] => -1 // [foobar = UNKNOWN LEXEME]
// [3] => 50195 // hello
// [4] => 92285 // tree
// [5] => 90606 // three
// )
// Print the decoded string
$decoded = $tokenizer->decode($encoded); // Decode the string
// NOTE: 'Hello' is unknown so it becomes 'hello' which is in the vocabulary
print_r($decoded); // "hello world  hello tree three" - the unknown -1 token decodes to an empty string, leaving a double space
$embedding = $tokenizer->GetEmbedding($encoded); // Get the embedding
// Not telling you what to do... but... you might want
// to feed the embedding into your neural network ;P
// Don't print the embedding: it's 99170 indexes long...
// you can print but... it's not very human readable
// Uncomment to print the embedding
//print_r($embedding);
// It looks like this:
// Array
// (
// ...
// [99165] => 1.0082419351834E-5
// [99166] => 1.0082419351834E-5
// [99167] => 1.0082419351834E-5
// [99168] => 1.0082419351834E-5
// [99169] => 1.0082419351834E-5
// [99170] => 1.0082419351834E-5
// )
*/