From e92f9af4fcc2a0662647fa6e0608758129d16fb0 Mon Sep 17 00:00:00 2001
From: Sean Lee
Date: Fri, 30 Aug 2024 11:43:17 +0800
Subject: [PATCH] correct the tokenizer for the chinese example

remove space
---
 README_CN.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/README_CN.md b/README_CN.md
index c63b8f6..e67c295 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -27,10 +27,16 @@ pip install baguetter
 ## 快速入门
 
 ```python
-from baguetter.indices import BMXSparseIndex
+from typing import List
+from baguetter.indices import BMXSparseIndex, TextPreprocessorConfig
+
+# 自定义中文 tokenizer
+def cjk_tokenizer(text: str) -> List[str]:
+    return list(text.replace(" ", ""))
 
 # 创建索引
-idx = BMXSparseIndex()
+idx = BMXSparseIndex(preprocessor_or_config=TextPreprocessorConfig(
+    custom_tokenizer=cjk_tokenizer))
 
 # 添加文档
 docs = [
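
For context, a minimal sketch of how the patched quick-start could be exercised end to end. The custom-tokenizer wiring (`TextPreprocessorConfig(custom_tokenizer=...)` passed via `preprocessor_or_config`) comes from the patch itself; the `add_many(keys, values)` and `search(query)` calls are assumed to match the English README, and the document IDs and sample sentences below are made up for illustration.

```python
from typing import List

from baguetter.indices import BMXSparseIndex, TextPreprocessorConfig


# Character-level CJK tokenizer: strip spaces, split into individual characters
def cjk_tokenizer(text: str) -> List[str]:
    return list(text.replace(" ", ""))


# Build the index with the custom tokenizer, as in the patch
idx = BMXSparseIndex(
    preprocessor_or_config=TextPreprocessorConfig(custom_tokenizer=cjk_tokenizer)
)

# Hypothetical sample documents and IDs, for illustration only
docs = [
    "我们都喜欢法棍和奶酪",
    "法棍是一种法国面包",
]
doc_ids = ["doc1", "doc2"]

# Assumed API per the English README: index the documents, then query
idx.add_many(doc_ids, docs)
results = idx.search("法棍")
print(results)
```

Splitting into single characters after removing spaces avoids the whitespace-based default tokenization, which produces no useful terms for unsegmented Chinese text.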