From d88461bc16183a5be307f9dc403a37f4265385b7 Mon Sep 17 00:00:00 2001 From: AfryMask Date: Sat, 15 Apr 2023 01:35:03 +0800 Subject: [PATCH] whisper : fix the bug related to word splitting errors in the "tokenize" function. (#760) Co-authored-by: AfryMask --- whisper.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 1e69da059ce..846d3a93dbe 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2449,25 +2449,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons int n = word.size(); while (i < n) { int j = n; + bool found = false; while (j > i) { - auto it = vocab.token_to_id.find(word.substr(i, j-i)); + auto sub = word.substr(i, j-i); + auto it = vocab.token_to_id.find(sub); if (it != vocab.token_to_id.end()) { tokens.push_back(it->second); i = j; + found = true; break; } --j; } - if (i == n) { - break; - } - if (j == i) { - auto sub = word.substr(i, 1); - if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { - tokens.push_back(vocab.token_to_id.at(sub)); - } else { - fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); - } + if (!found) { + fprintf(stderr, "unknown token \n"); ++i; } }