Skip to content

Commit

Permalink
whisper : fix the bug related to word splitting errors in the "tokenize" function. (ggerganov#760)
Browse files Browse the repository at this point in the history

Co-authored-by: AfryMask <afrymask@gmail.com>
  • Loading branch information
AfryMask and AfryMask authored Apr 14, 2023
1 parent 51840fe commit d88461b
Showing 1 changed file with 6 additions and 11 deletions.
17 changes: 6 additions & 11 deletions whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2449,25 +2449,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
int n = word.size();
while (i < n) {
int j = n;
bool found = false;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
auto sub = word.substr(i, j-i);
auto it = vocab.token_to_id.find(sub);
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
found = true;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
if (!found) {
fprintf(stderr, "unknown token \n");
++i;
}
}
Expand Down

0 comments on commit d88461b

Please sign in to comment.