From e92f9af4fcc2a0662647fa6e0608758129d16fb0 Mon Sep 17 00:00:00 2001
From: Sean Lee
Date: Fri, 30 Aug 2024 11:43:17 +0800
Subject: [PATCH] correct the tokenizer for the chinese example

remove space
---
 README_CN.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/README_CN.md b/README_CN.md
index c63b8f6..e67c295 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -27,10 +27,16 @@ pip install baguetter
 ## 快速入门
 
 ```python
-from baguetter.indices import BMXSparseIndex
+from typing import List
+from baguetter.indices import BMXSparseIndex, TextPreprocessorConfig
+
+# 自定义中文 tokenizer
+def cjk_tokenizer(text: str) -> List[str]:
+    return list(text.replace(" ", ""))
 
 # 创建索引
-idx = BMXSparseIndex()
+idx = BMXSparseIndex(preprocessor_or_config=TextPreprocessorConfig(
+    custom_tokenizer=cjk_tokenizer))
 
 # 添加文档
 docs = [
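
For context, a minimal sketch of how the patched quick-start could be exercised end to end. The custom-tokenizer wiring (`TextPreprocessorConfig(custom_tokenizer=...)` passed via `preprocessor_or_config`) comes from the patch itself; the `add_many(keys, values)` and `search(query)` calls are assumed to match the English README, and the document IDs and sample sentences below are made up for illustration.

```python
from typing import List

from baguetter.indices import BMXSparseIndex, TextPreprocessorConfig


# Character-level CJK tokenizer: strip spaces, split into individual characters
def cjk_tokenizer(text: str) -> List[str]:
    return list(text.replace(" ", ""))


# Build the index with the custom tokenizer, as in the patch
idx = BMXSparseIndex(
    preprocessor_or_config=TextPreprocessorConfig(custom_tokenizer=cjk_tokenizer)
)

# Hypothetical sample documents and IDs, for illustration only
docs = [
    "我们都喜欢法棍和奶酪",
    "法棍是一种法国面包",
]
doc_ids = ["doc1", "doc2"]

# Assumed API per the English README: index the documents, then query
idx.add_many(doc_ids, docs)
results = idx.search("法棍")
print(results)
```

Splitting into single characters after removing spaces avoids the whitespace-based default tokenization, which produces no useful terms for unsegmented Chinese text.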