From d88461bc16183a5be307f9dc403a37f4265385b7 Mon Sep 17 00:00:00 2001 From: AfryMask Date: Sat, 15 Apr 2023 01:35:03 +0800 Subject: [PATCH] whisper : fix the bug related to word splitting errors in the "tokenize" function. (#760) Co-authored-by: AfryMask --- whisper.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 1e69da059ce..846d3a93dbe 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2449,25 +2449,20 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons int n = word.size(); while (i < n) { int j = n; + bool found = false; while (j > i) { - auto it = vocab.token_to_id.find(word.substr(i, j-i)); + auto sub = word.substr(i, j-i); + auto it = vocab.token_to_id.find(sub); if (it != vocab.token_to_id.end()) { tokens.push_back(it->second); i = j; + found = true; break; } --j; } - if (i == n) { - break; - } - if (j == i) { - auto sub = word.substr(i, 1); - if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { - tokens.push_back(vocab.token_to_id.at(sub)); - } else { - fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); - } + if (!found) { + fprintf(stderr, "unknown token \n"); ++i; } }