diff --git a/src/llama.cpp b/src/llama.cpp index 8ab94d9ef926b..2879a5348bc4c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15468,6 +15468,15 @@ struct llm_tokenizer_bpe { case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH: regex_exprs = {"\\w+|[^\\w\\s]+"}; break; + default: + // default regex for BPE tokenization pre-processing + regex_exprs = { + "[\\p{P}\\$\\+<=>\\^~\\|]+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]", + }; + break; } }