fix: fix preprocessing jina v2 zh
JoanFM committed Jun 6, 2024
1 parent d86efa6 commit a8a64fd
Showing 1 changed file with 10 additions and 7 deletions.
llama.cpp (17 changes: 10 additions & 7 deletions)
@@ -13038,13 +13038,17 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
-                //TODO: Apply GPT2 + lowercasing
+                //TODO: Apply lowercase + whitespace pretokenization
                 {
                     std::string lowercase_text = text;
                     std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(), [](unsigned char c){ return std::tolower(c); });
-                    word_collection = unicode_regex_split(lowercase_text, {
-                        "",
-                    });
+                    std::regex regexPattern("\\w+|[^\\w\\s]+");
+                    std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
+                    std::sregex_token_iterator end;
+
+                    while (it != end) {
+                        word_collection.push_back(*it++);
+                    }
                 }
                 break;
             default:
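
For reference, a minimal standalone sketch of the new pretokenization path above: lowercase the input, then split it into word runs (\w+) and punctuation runs ([^\w\s]+), discarding whitespace. The sample input and main() scaffolding are illustrative only, not taken from llama.cpp:

    // Sketch only: reproduces the patch's lowercasing + regex split outside llama.cpp.
    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        std::string text = "Hello, World! foo_bar 42";

        // Byte-wise lowercasing, as in the patch.
        std::string lowercase_text = text;
        std::transform(lowercase_text.begin(), lowercase_text.end(), lowercase_text.begin(),
                       [](unsigned char c) { return std::tolower(c); });

        // Word runs (\w+) or punctuation runs ([^\w\s]+); whitespace is dropped.
        std::regex regexPattern("\\w+|[^\\w\\s]+");
        std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
        std::sregex_token_iterator end;

        std::vector<std::string> word_collection;
        while (it != end) {
            word_collection.push_back(*it++);
        }

        for (const auto & w : word_collection) {
            std::cout << '[' << w << "] ";  // [hello] [,] [world] [!] [foo_bar] [42]
        }
        std::cout << '\n';
    }

Note that under typical std::regex implementations the default locale matches byte-wise, so multibyte UTF-8 sequences (including the Chinese text this pre-type targets) fall into the [^\w\s]+ alternative, with adjacent multibyte characters grouped into a single run.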
@@ -13153,10 +13157,9 @@ struct llm_tokenizer_bpe {
                     for (auto j = str.begin(); j != str.end(); ++j) {
                         std::string byte_str(1, *j);
                         auto token_multibyte = vocab.token_to_id.find(byte_str);
-                        if (token_multibyte == vocab.token_to_id.end()) {
-                            throw std::runtime_error("ERROR: byte not found in vocab");
+                        if (token_multibyte != vocab.token_to_id.end()) {
+                            output.push_back((*token_multibyte).second);
                         }
-                        output.push_back((*token_multibyte).second);
                     }
                 } else {
                     output.push_back((*token).second);
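
This second hunk relaxes the byte-fallback: a single byte missing from the vocabulary is now skipped rather than aborting tokenization with a runtime_error. A toy sketch of the new behavior, with a made-up vocabulary map:

    // Toy sketch of the new byte-fallback behavior: bytes absent from the
    // vocabulary are skipped instead of raising a runtime_error. The
    // token_to_id contents are made up for illustration.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        std::map<std::string, int> token_to_id = {
            {"a", 1}, {"b", 2},  // "c" deliberately missing
        };
        std::string str = "abc";
        std::vector<int> output;

        for (auto j = str.begin(); j != str.end(); ++j) {
            std::string byte_str(1, *j);
            auto token_multibyte = token_to_id.find(byte_str);
            if (token_multibyte != token_to_id.end()) {  // skip, don't throw
                output.push_back((*token_multibyte).second);
            }
        }

        for (int id : output) {
            std::cout << id << ' ';  // prints: 1 2
        }
        std::cout << '\n';
    }

Skipping rather than throwing means an out-of-vocabulary byte degrades the encoding instead of aborting it.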