From a8a64fd0733ff0c4ec6c52348bc5292b72000f6a Mon Sep 17 00:00:00 2001
From: Joan Martinez <joan.fontanals.martinez@jina.ai>
Date: Thu, 6 Jun 2024 10:15:07 +0200
Subject: [PATCH] fix: fix preprocessing jina v2 zh

---
 llama.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 20ac0f9168674..aaf22944c1854 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13038,13 +13038,17 @@ struct llm_tokenizer_bpe {
                         });
                         break;
                     case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
-                        //TODO: Apply GPT2 + lowercasing
+                        // Lowercase byte-wise, then split into runs of word characters or runs of non-word, non-space characters (whitespace is dropped)
                         {
                             std::string lowercase_text = text;
                             std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
-                            word_collection = unicode_regex_split(lowercase_text, {
-                                "",
-                            });
+                            std::regex regexPattern("\\w+|[^\\w\\s]+");
+                            std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
+                            std::sregex_token_iterator end;
+
+                            while (it != end) {
+                                word_collection.push_back(*it++);
+                            }
                         }
                         break;
                     default:
@@ -13153,10 +13157,9 @@ struct llm_tokenizer_bpe {
                     for (auto j = str.begin(); j != str.end(); ++j) {
                         std::string byte_str(1, *j);
                         auto token_multibyte = vocab.token_to_id.find(byte_str);
-                        if (token_multibyte == vocab.token_to_id.end()) {
-                            throw std::runtime_error("ERROR: byte not found in vocab");
+                        if (token_multibyte != vocab.token_to_id.end()) {
+                            output.push_back((*token_multibyte).second);
                         }
-                        output.push_back((*token_multibyte).second);
                     }
                 } else {
                     output.push_back((*token).second);