From afd76e62547001d7d2bff9227d6d7a2875f301df Mon Sep 17 00:00:00 2001
From: Joan Martinez
Date: Mon, 8 Jul 2024 15:40:27 +0200
Subject: [PATCH] fix: handle default

---
 src/llama.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8ab94d9ef926b..2879a5348bc4c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15468,6 +15468,15 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
                 regex_exprs = {"\\w+|[^\\w\\s]+"};
                 break;
+            default:
+                // default regex for BPE tokenization pre-processing
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "\\p{N}+",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
         }
     }
 