pretokenizer.go
package tokenizer

import (
	"fmt"
	"log"

	"github.com/sugarme/tokenizer/normalizer"
)
// PreToken is a wrapper for a subpart of a NormalizedString.
type PreToken struct {
	Value   string
	Offsets []int
	Tokens  []Token // optional
}
// OffsetType is an enum-like type describing the kind of offsets to produce.
type OffsetType int

const (
	Byte OffsetType = iota
	Char
)
// Split contains the underlying `NormalizedString` as well as its offsets
// in the original string. These offsets are in the `original` referential.
// It also contains any `Token` associated with the current split.
type Split struct {
	// normalized is the underlying `NormalizedString`. Each SubString is
	// represented by a `NormalizedString`. In the end, there might be many
	// SubStrings representing various parts of the original input string.
	normalized *normalizer.NormalizedString

	// tokens is the optional set of Tokens associated with this split.
	tokens []Token
}
// NewSplit creates a new Split from an input NormalizedString.
func NewSplit(normalized *normalizer.NormalizedString, tokens []Token) Split {
	return Split{normalized, tokens}
}
// PreTokenizedString is in charge of splitting an underlying string, making
// sure everything is fine while doing so, and providing ways to normalize
// and tokenize these splits.
//
// Once everything has been normalized and tokenized, the `PreTokenizedString`
// is able to build an `Encoding` with all the relevant offsets and word ids,
// relative to the original string.
type PreTokenizedString struct {
	original string
	splits   []Split
}
// SplitFn takes a `NormalizedString` and returns an iterator over the
// produced `NormalizedString`s.
//
// NOTE. SplitFn is free to modify these `NormalizedString`s as long as the
// produced `NormalizedString`s, when combined back together, have the same
// `original` string as the one given to `SplitFn`. For the offsets tracking
// to work as expected, `SplitFn` must therefore produce "splits" of the
// ORIGINAL string.
type SplitFn func(int, *normalizer.NormalizedString) []SplitIdx
// Split splits the `PreTokenizedString` by providing a `SplitFn` which is in
// charge of splitting each substring (`NormalizedString`) into multiple parts.
func (pt *PreTokenizedString) Split(splitFn SplitFn) *PreTokenizedString {
	var newSplits []Split
	for i, originalSplit := range pt.splits {
		// Splits that already have tokens attached are kept as-is.
		if originalSplit.tokens != nil {
			newSplits = append(newSplits, originalSplit)
			continue
		}

		// Keep only the non-empty pieces produced by splitFn.
		for _, splitIdx := range splitFn(i, originalSplit.normalized) {
			if splitIdx.Normalized.GetNormalized() != "" {
				split := Split{
					normalized: splitIdx.Normalized,
					tokens:     splitIdx.Tokens,
				}
				newSplits = append(newSplits, split)
			}
		}
	}

	pt.splits = newSplits
	return pt
}
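// Illustrative sketch, not part of the original file: the simplest possible
// SplitFn, which performs no further splitting and hands each substring back
// as a single SplitIdx (assuming SplitIdx carries the Normalized and Tokens
// fields accessed above). It would be used as pt.Split(exampleNoopSplitFn).
func exampleNoopSplitFn(_ int, n *normalizer.NormalizedString) []SplitIdx {
	// Return the substring untouched, with no pre-attached tokens. A real
	// SplitFn would break n into pieces of the ORIGINAL string (e.g. on
	// whitespace or punctuation).
	return []SplitIdx{{Normalized: n, Tokens: nil}}
}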
// Normalize normalizes all the splits that do not have attached `Tokens`,
// using the provided `normalize` function.
func (pt *PreTokenizedString) Normalize(nFn func(*normalizer.NormalizedString) *normalizer.NormalizedString) *PreTokenizedString {
	var nSplits []Split
	for _, split := range pt.splits {
		newSplit := split
		// Only splits without attached tokens are normalized; splits that
		// already carry tokens are kept unchanged rather than dropped.
		if split.tokens == nil {
			newSplit.normalized = nFn(split.normalized)
		}
		nSplits = append(nSplits, newSplit)
	}

	pt.splits = nSplits
	return pt
}
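// Illustrative sketch, not part of the original file: a normalize callback
// with the expected signature. This one is a pass-through; a real callback
// would return a transformed NormalizedString (lowercasing, accent stripping,
// etc.) produced via the normalizer package.
func examplePassThroughNormalize(n *normalizer.NormalizedString) *normalizer.NormalizedString {
	return n
}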
// Tokenize tokenizes all the splits that do not have attached `Tokens`,
// using the provided `tokenize` function.
func (pt *PreTokenizedString) Tokenize(tokFn func(*normalizer.NormalizedString) ([]Token, error)) (*PreTokenizedString, error) {
	var nSplits []Split
	for _, split := range pt.splits {
		newSplit := split
		if split.tokens == nil {
			toks, err := tokFn(split.normalized)
			if err != nil {
				return nil, err
			}
			newSplit.tokens = toks
		}
		nSplits = append(nSplits, newSplit)
	}

	pt.splits = nSplits
	return pt, nil
}
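// Illustrative sketch, not part of the original file: a tokenize callback
// that maps each split to a single Token spanning the whole normalized
// substring. It assumes Token carries the Id, Value and []int byte-range
// Offsets fields that IntoEncoding reads below; a real tokenizer would look
// up vocabulary ids instead of the hard-coded 0.
func exampleWholeSplitTokenize(n *normalizer.NormalizedString) ([]Token, error) {
	s := n.GetNormalized()
	// Offsets are byte offsets relative to the normalized substring, as
	// expected when IntoEncoding converts them back to the original string.
	return []Token{{Id: 0, Value: s, Offsets: []int{0, len(s)}}}, nil
}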
// IntoEncoding transforms the current `PreTokenizedString` into an `Encoding`.
//
// If a `wordIdx` other than -1 is provided, every word in the generated
// `Encoding` is set to this value; with -1, word indices are derived from the
// split index. A fixed `wordIdx` is generally used with pre-tokenized input
// that does not need the `PreTokenizedString` to generate word ids.
//
// This method fails if some splits do not have associated `Token`s.
func (pt *PreTokenizedString) IntoEncoding(typeId int, wordIdx int, offsetType OffsetType) (*Encoding, error) {
	if len(pt.splits) == 0 {
		return DefaultEncoding(), nil
	}

	for _, s := range pt.splits {
		if len(s.tokens) == 0 {
			return nil, fmt.Errorf("split has not been tokenized; call 'PreTokenizedString.Tokenize()' first")
		}
	}

	// charMap maps byte index -> rune index in the original string. It is
	// only populated when character offsets are requested; for byte offsets
	// it stays empty and the byte offsets are used as-is.
	charMap := make(map[int]int)
	switch offsetType {
	case Char:
		currRuneIdx := 0
		for byteIdx, r := range pt.original {
			nbytes := len([]byte(string(r)))
			for i := 0; i < nbytes; i++ {
				charMap[byteIdx+i] = currRuneIdx
			}
			currRuneIdx++
		}
	case Byte:
		// Nothing to do: offsets remain byte-based.
	default:
		return nil, fmt.Errorf("invalid offsetType (%v)", offsetType)
	}
	var (
		enIds               []int
		enTokens            []string
		enWords             []int
		enTypeIds           []int
		enOffsets           [][]int
		enSpecialTokensMask []int
		enAttentionMask     []int
	)

	for idx, split := range pt.splits {
		normalized := split.normalized
		offsets := normalized.OffsetsOriginal()

		for _, tok := range split.tokens {
			// Convert the token offsets (relative to the normalized split)
			// back to offsets on the original string.
			var convertedOffsets []int
			o := normalized.ConvertOffset(normalizer.NewRange(tok.Offsets[0], tok.Offsets[1], normalizer.NormalizedTarget))
			if o == nil {
				convertedOffsets = []int{offsets[0] + tok.Offsets[0], offsets[0] + tok.Offsets[1]}
			} else {
				convertedOffsets = []int{offsets[0] + o.Start(), offsets[0] + o.End()}
			}

			// Convert to char offsets if relevant.
			start, ok := charMap[convertedOffsets[0]]
			if !ok {
				start = -1
			}
			end, ok := charMap[convertedOffsets[1]]
			if !ok {
				end = -1
			}

			var newConvertedOffsets []int
			switch {
			case start != -1 && end != -1:
				newConvertedOffsets = []int{start, end}
			case start != -1 && end == -1:
				// If we reached the end of the string, `end` is not in the
				// map, but the byte just before it should be.
				last, ok := charMap[convertedOffsets[1]-1]
				if !ok {
					log.Printf("unexpected: byte index %v not found in char map", convertedOffsets[1]-1)
					last = start + 1
				}
				newConvertedOffsets = []int{start, last}
			default:
				newConvertedOffsets = convertedOffsets
			}

			// With wordIdx == -1, word indices follow the split index;
			// otherwise every token gets the provided wordIdx.
			wordIndex := wordIdx
			if wordIdx == -1 {
				wordIndex = idx
			}

			enIds = append(enIds, tok.Id)
			enTokens = append(enTokens, tok.Value)
			enOffsets = append(enOffsets, newConvertedOffsets)
			enWords = append(enWords, wordIndex)
			enTypeIds = append(enTypeIds, typeId)
			enSpecialTokensMask = append(enSpecialTokensMask, 0)
			enAttentionMask = append(enAttentionMask, 1)
		}
	}

	en := DefaultEncoding()
	en.Ids = enIds
	en.Tokens = enTokens
	en.Offsets = enOffsets
	en.Words = enWords
	en.TypeIds = enTypeIds
	en.SpecialTokenMask = enSpecialTokensMask
	en.AttentionMask = enAttentionMask

	return en, nil
}
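// Illustrative end-to-end usage (a sketch, not part of the original file;
// exampleWholeSplitTokenize is the hypothetical callback sketched above):
//
//	pt := NewPreTokenizedString("Hello world")
//	pt, err := pt.Tokenize(exampleWholeSplitTokenize)
//	if err != nil {
//		// handle error
//	}
//	// typeId 0, wordIdx -1 (auto word ids), byte offsets
//	en, err := pt.IntoEncoding(0, -1, Byte)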
// GetSplits returns a list of splits, each of them being a slice of the
// normalized string, the associated offsets (in either the original or the
// normalized referential), as well as the potential tokens.
func (pt *PreTokenizedString) GetSplits(offsetRef normalizer.IndexOn, offsetType OffsetType) []PreToken {
	var preToks []PreToken

	var offsetConverter OffsetConverter
	if offsetType == Char {
		offsetConverter = NewBytesToCharOffsetConverter(pt.original)
	}

	offset := 0
	for _, s := range pt.splits {
		var offsets []int
		switch {
		case offsetRef == normalizer.OriginalTarget:
			offsets = s.normalized.OffsetsOriginal()
		case offsetRef == normalizer.NormalizedTarget:
			length := s.normalized.Len()
			offset += length
			offsets = []int{offset - length, offset}
		}

		// Convert to char offsets if relevant.
		if offsetConverter != nil {
			var err error
			offsets, err = offsetConverter.Convert(offsets)
			if err != nil {
				panic(err)
			}
		}

		preToks = append(preToks, PreToken{s.normalized.GetNormalized(), offsets, s.tokens})
	}

	return preToks
}
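// Illustrative sketch, not part of the original file: retrieving the splits
// with character offsets expressed in the original referential:
//
//	preToks := pt.GetSplits(normalizer.OriginalTarget, Char)
//	for _, pretok := range preToks {
//		fmt.Println(pretok.Value, pretok.Offsets)
//	}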
// NewPreTokenizedStringFromNS creates a PreTokenizedString from an input
// NormalizedString.
func NewPreTokenizedStringFromNS(n *normalizer.NormalizedString) *PreTokenizedString {
	return &PreTokenizedString{
		original: n.GetOriginal(),
		splits:   []Split{{normalized: n, tokens: nil}},
	}
}
// NewPreTokenizedString creates a new PreTokenizedString from an input string.
func NewPreTokenizedString(s string) *PreTokenizedString {
	n := normalizer.NewNormalizedFrom(s)
	return NewPreTokenizedStringFromNS(n)
}
// OffsetConverter converts a pair of offsets from one indexing scheme to
// another (e.g. bytes to characters).
type OffsetConverter interface {
	Convert(offsets []int) ([]int, error)
}

// BytesToCharOffsetConverter converts byte-indexed offsets to
// character(rune)-indexed offsets.
type BytesToCharOffsetConverter struct {
	b2c map[int]int // map of byte index to character (rune) index
}
// NewBytesToCharOffsetConverter builds a byte-to-rune index map for the
// given UTF-8 sequence.
func NewBytesToCharOffsetConverter(sequence string) *BytesToCharOffsetConverter {
	chars := []rune(sequence)
	b2c := make(map[int]int)
	n := 0
	for charIdx, char := range chars {
		nbytes := len([]byte(string(char)))
		// Every byte of a (possibly multi-byte) rune maps to the same rune index.
		for i := 0; i < nbytes; i++ {
			byteIdx := n + i
			b2c[byteIdx] = charIdx
		}
		n += nbytes
	}

	return &BytesToCharOffsetConverter{b2c}
}
// Convert converts byte-indexed offsets to character-indexed offsets.
func (c *BytesToCharOffsetConverter) Convert(offsets []int) ([]int, error) {
	start, ok := c.b2c[offsets[0]]
	if !ok {
		return nil, fmt.Errorf("invalid offsets start %v", offsets[0])
	}

	end, ok := c.b2c[offsets[1]]
	if !ok {
		return nil, fmt.Errorf("invalid offsets end %v", offsets[1])
	}

	return []int{start, end}, nil
}
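// Illustrative sketch, not part of the original file: mapping a byte range
// on a multi-byte UTF-8 string to rune indices. "é" occupies two bytes, so
// byte index 3 falls on the third rune:
//
//	c := NewBytesToCharOffsetConverter("héllo")
//	offsets, err := c.Convert([]int{0, 3}) // -> []int{0, 2}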