Skip to content

Commit

Permalink
fix: handle default
Browse files: browse the repository at this point in the history
  • Loading branch information
JoanFM committed Jul 8, 2024
1 parent 0699a4c commit afd76e6
Showing 1 changed file with 9 additions and 0 deletions.
9 changes: 9 additions & 0 deletions src/llama.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -15468,6 +15468,15 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
regex_exprs = {"\\w+|[^\\w\\s]+"};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
};
break;
}
}

Expand Down

0 comments on commit afd76e6

Please sign in to comment.