From 4cb9540c99616c159a1673854a011512e4d2801d Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Mon, 19 Aug 2019 19:34:56 +0800
Subject: [PATCH 1/8] feat: add Chinese language

just at the beginning of adding Chinese
---
 .gitignore              |  5 ++---
 Cargo.toml              |  4 +++-
 examples/export_json.rs | 29 +++++++++++++++++---------
 examples/out_zh.json    | 45 +++++++++++++++++++++++++++++++++++++++++
 src/lang/mod.rs         | 13 +++++++++++-
 src/lang/zh.rs          | 24 ++++++++++++++++++++++
 src/lib.rs              | 27 ++++++++++++++++++++++++-
 7 files changed, 131 insertions(+), 16 deletions(-)
 create mode 100644 examples/out_zh.json
 create mode 100644 src/lang/zh.rs

diff --git a/.gitignore b/.gitignore
index 32891a1..2152445 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,8 +8,7 @@ Cargo.lock
 
 # These are backup files generated by rustfmt
 **/*.rs.bk
-
 examples/out.json
 out.json
-
-**/node_modules/
\ No newline at end of file
+**/node_modules/
+.idea/
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index c703ac7..cceed35 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,13 +28,14 @@ serde_derive = "1.0.34" # First verstion to support #[serde(flatten)]
 serde_json = "1"
 strum = "0.15"
 strum_macros = "0.15"
+jieba-rs = "0.4.10"
 
 [features]
 default = ["languages"]
 nightly = ["bench"]
 bench = []
 
-languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr"]
+languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr", "zh"]
 da = ["rust-stemmers"]
 de = ["rust-stemmers"]
 du = ["rust-stemmers"]
@@ -47,3 +48,4 @@ ro = ["rust-stemmers"]
 ru = ["rust-stemmers"]
 sv = ["rust-stemmers"]
 tr = ["rust-stemmers"]
+zh = ["rust-stemmers"]
diff --git a/examples/export_json.rs b/examples/export_json.rs
index 9c8cd44..8740369 100644
--- a/examples/export_json.rs
+++ b/examples/export_json.rs
@@ -1,25 +1,34 @@
 extern crate elasticlunr;
 
 use elasticlunr::Index;
+use elasticlunr::Language;
 use std::fs::File;
 use std::io::Write;
 
 fn main() {
-    let mut index = Index::new(&["title", "body"]);
+    let mut index = Index::with_language(Language::Chinese, &["title", "body"], );
     index.add_doc(
         "1",
         &[
-            "This Week in Rust 207",
-            "Hello and welcome to another issue of This Week in Rust!",
-        ],
-    );
-    index.add_doc(
-        "2",
-        &[
-            "This Week in Rust 206",
-            "Hello and welcome to another issue of This Week in Rust!",
+            "中华人民共和国",
+            "杭州余杭区人民欢迎你"
         ],
     );
+
+//    index.add_doc(
+//        "1",
+//        &[
+//            "This Week in Rust 207",
+//            "Hello and welcome to another issue of This Week in Rust!",
+//        ],
+//    );
+//    index.add_doc(
+//        "2",
+//        &[
+//            "This Week in Rust 206",
+//            "Hello and welcome to another issue of This Week in Rust!",
+//        ],
+//    );
     let mut file = File::create("examples/out.json").unwrap();
     file.write_all(index.to_json_pretty().as_bytes()).unwrap();
 }
diff --git a/examples/out_zh.json b/examples/out_zh.json
new file mode 100644
index 0000000..110e222
--- /dev/null
+++ b/examples/out_zh.json
@@ -0,0 +1,45 @@
+{
+  "fields": [
+    "title",
+    "body"
+  ],
+  "pipeline": [
+    "trimmer-zh",
+    "stopWordFilter-zh",
+    "stemmer-zh"
+  ],
+  "ref": "id",
+  "version": "0.9.5",
+  "index": {
+    "body": {
+      "root": {
+        "docs": {},
+        "df": 0
+      }
+    },
+    "title": {
+      "root": {
+        "docs": {},
+        "df": 0
+      }
+    }
+  },
+  "documentStore": {
+    "save": true,
+    "docs": {
+      "1": {
+        "body": "杭州余杭区人民欢迎你",
+        "id": "1",
+        "title": "中华人民共和国"
+      }
+    },
+    "docInfo": {
+      "1": {
+        "body": 0,
+        "title": 0
+      }
+    },
+    "length": 1
+  },
+  "lang": "Chinese"
+}
\ No newline at end of file
diff --git a/src/lang/mod.rs b/src/lang/mod.rs
index 9edd84a..fc31ddb 100644
--- a/src/lang/mod.rs
+++ b/src/lang/mod.rs
@@ -54,7 +54,7 @@ macro_rules! make_stemmer {
 }
 
 /// Used to configure the `Index` for a specific lanugage.
-#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter)]
+#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter, Serialize, Deserialize)]
 pub enum Language {
     English,
     #[cfg(feature = "da")]
@@ -81,6 +81,8 @@ pub enum Language {
     Swedish,
     #[cfg(feature = "tr")]
     Turkish,
+    #[cfg(feature = "zh")]
+    Chinese,
     #[doc(hidden)]
     #[strum(disabled = "true")]
     __NonExhaustive,
@@ -123,6 +125,8 @@ impl Language {
             "sv" => Some(Language::Swedish),
             #[cfg(feature = "tr")]
             "tr" => Some(Language::Turkish),
+            #[cfg(feature = "zh")]
+            "zh" => Some(Language::Chinese),
             _ => None,
         }
     }
@@ -162,6 +166,8 @@ impl Language {
             Language::Swedish => "sv",
             #[cfg(feature = "tr")]
             Language::Turkish => "tr",
+            #[cfg(feature = "zh")]
+            Language::Chinese => "zh",
             _ => panic!("Don't use the __NonExhaustive variant!"),
         }
     }
@@ -194,11 +200,14 @@ impl Language {
             Language::Swedish => ::lang::sv::make_pipeline(),
             #[cfg(feature = "tr")]
             Language::Turkish => ::lang::tr::make_pipeline(),
+            #[cfg(feature = "zh")]
+            Language::Chinese => ::lang::zh::make_pipeline(),
             _ => panic!("Dont use the `__NonExhaustive` variant!"),
         }
     }
 }
 
+
 pub mod en;
 
 #[cfg(feature = "da")]
@@ -225,3 +234,5 @@ pub mod ru;
 pub mod sv;
 #[cfg(feature = "tr")]
 pub mod tr;
+#[cfg(feature = "zh")]
+pub mod zh;
\ No newline at end of file
diff --git a/src/lang/zh.rs b/src/lang/zh.rs
new file mode 100644
index 0000000..c07a4ca
--- /dev/null
+++ b/src/lang/zh.rs
@@ -0,0 +1,24 @@
+use pipeline::Pipeline;
+
+
+pub fn make_pipeline() -> Pipeline {
+    Pipeline {
+        queue: vec![
+            ("trimmer-zh".into(), trimmer),
+            ("stopWordFilter-zh".into(), stop_word_filter),
+            ("stemmer-zh".into(), stemmer),
+        ],
+    }
+}
+
+pub fn trimmer(token: String) -> Option<String> {
+    Some(token)
+}
+
+make_stop_word_filter!([
+    ""
+]);
+
+fn stemmer(token: String) -> Option<String> {
+    Some(token)
+}
diff --git a/src/lib.rs b/src/lib.rs
index 7858da6..4217ba7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,6 +44,10 @@ extern crate rust_stemmers;
 #[macro_use]
 extern crate maplit;
 
+#[cfg(feature = "zh")]
+extern crate jieba_rs;
+
+
 /// The version of elasticlunr.js this library was designed for.
 pub const ELASTICLUNR_VERSION: &str = "0.9.5";
 
@@ -59,6 +63,7 @@ use document_store::DocumentStore;
 use inverted_index::InvertedIndex;
 pub use lang::Language;
 pub use pipeline::Pipeline;
+use jieba_rs::Jieba;
 
 /// A builder for an `Index` with custom parameters.
 ///
@@ -149,6 +154,7 @@ impl IndexBuilder {
             document_store: DocumentStore::new(self.save),
             pipeline: self.pipeline.unwrap_or_default(),
             version: ::ELASTICLUNR_VERSION,
+            lang: Language::English,
         }
     }
 }
@@ -165,6 +171,7 @@ pub struct Index {
     pub version: &'static str,
     index: BTreeMap<String, InvertedIndex>,
     pub document_store: DocumentStore,
+    lang: Language,
 }
 
 impl Index {
@@ -226,6 +233,7 @@ impl Index {
             ref_field: "id".into(),
             version: ::ELASTICLUNR_VERSION,
             document_store: DocumentStore::new(true),
+            lang: lang,
         }
     }
 
@@ -256,7 +264,23 @@ impl Index {
                 continue;
             }
 
-            let tokens = self.pipeline.run(pipeline::tokenize(value.as_ref()));
+            let raw_tokens: Vec<String>;
+
+            if self.lang == Language::Chinese {
+                let jieba = Jieba::new();
+                raw_tokens = jieba.cut_for_search(value.as_ref(), false)
+                    .iter()
+                    .map(|s| (*s).into())
+                    .collect();
+
+                println!("raw tokens: {:?}", raw_tokens);
+            } else {
+                raw_tokens = pipeline::tokenize(value.as_ref());
+            }
+
+            let tokens = self.pipeline.run(raw_tokens);
+            println!("tokens: {:?}", tokens);
+
             self.document_store
                 .add_field_length(doc_ref, field, tokens.len());
 
@@ -266,6 +290,7 @@ impl Index {
 
             for (token, count) in &token_freq {
                 let freq = (*count as f64).sqrt();
+                println!("token={}, freq={}", token, freq);
                 self.index
                     .get_mut(field)
                     .expect(&format!("InvertedIndex does not exist for field {}", field))

From ca26173f37cf5eb1153078105f26a71ea3b4f421 Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Mon, 19 Aug 2019 20:36:33 +0800
Subject: [PATCH 2/8] feat: add Chinese language

add Chinese trimmer
---
 examples/export_json.rs |  2 +-
 src/lang/zh.rs          | 38 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/examples/export_json.rs b/examples/export_json.rs
index 8740369..54aba29 100644
--- a/examples/export_json.rs
+++ b/examples/export_json.rs
@@ -10,7 +10,7 @@ fn main() {
     index.add_doc(
         "1",
         &[
-            "中华人民共和国",
+            "中华人民，共和国, hello word",
             "杭州余杭区人民欢迎你"
         ],
     );
diff --git a/src/lang/zh.rs b/src/lang/zh.rs
index c07a4ca..7f06117 100644
--- a/src/lang/zh.rs
+++ b/src/lang/zh.rs
@@ -11,14 +11,48 @@ pub fn make_pipeline() -> Pipeline {
     }
 }
 
+
 pub fn trimmer(token: String) -> Option<String> {
-    Some(token)
+    println!("trim {}", token);
+
+    for c in token.chars() {
+        println!("{}, {}", c, c as u32);
+    }
+
+    let ret: String = token.
+        trim_matches(|c: char| !is_valid_char(c)  )
+        .into();
+
+    println!("end trim{}", ret);
+
+    if ret.eq("") {
+        return None;
+    }
+
+    Some(ret)
 }
 
 make_stop_word_filter!([
-    ""
+    "的", "了"
 ]);
 
 fn stemmer(token: String) -> Option<String> {
     Some(token)
 }
+
+fn is_valid_char(c: char) -> bool {
+    let min_max_list = [
+        [19668, 40869], // min and max Chinese char
+        ['a' as u32, 'z' as u32],
+        ['A' as u32, 'Z' as u32]
+    ];
+
+    let c = c as u32;
+    for min_max in min_max_list.iter() {
+        if c >= min_max[0] && c <= min_max[1] {
+            return true;
+        }
+    }
+
+    false
+}
\ No newline at end of file

From f6d814b30dcea9a14f53260a937f4bd84f9f7947 Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Tue, 20 Aug 2019 11:15:05 +0800
Subject: [PATCH 3/8] style: use match instead of if to execute a different
 tokenizing function by Language

---
 src/lib.rs | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 4217ba7..fea53a2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -266,16 +266,20 @@ impl Index {
 
             let raw_tokens: Vec<String>;
 
-            if self.lang == Language::Chinese {
-                let jieba = Jieba::new();
-                raw_tokens = jieba.cut_for_search(value.as_ref(), false)
-                    .iter()
-                    .map(|s| (*s).into())
-                    .collect();
-
-                println!("raw tokens: {:?}", raw_tokens);
-            } else {
-                raw_tokens = pipeline::tokenize(value.as_ref());
+            match self.lang {
+                Language::Chinese => {
+                    let jieba = Jieba::new();
+
+                    raw_tokens = jieba.cut_for_search(value.as_ref(), false)
+                        .iter()
+                        .map(|s| (*s).into())
+                        .collect();
+
+                    println!("raw tokens: {:?}", raw_tokens);
+                },
+                _ => {
+                    raw_tokens = pipeline::tokenize(value.as_ref());
+                }
             }
 
             let tokens = self.pipeline.run(raw_tokens);

From 7f1d9df992b7256f6edf7886f79c605c5ebbf176 Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Tue, 20 Aug 2019 14:52:48 +0800
Subject: [PATCH 4/8] refactor: remove debugging println!

---
 src/lang/zh.rs | 8 --------
 src/lib.rs     | 5 +----
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/lang/zh.rs b/src/lang/zh.rs
index 7f06117..5d29384 100644
--- a/src/lang/zh.rs
+++ b/src/lang/zh.rs
@@ -13,18 +13,10 @@ pub fn make_pipeline() -> Pipeline {
 
 
 pub fn trimmer(token: String) -> Option<String> {
-    println!("trim {}", token);
-
-    for c in token.chars() {
-        println!("{}, {}", c, c as u32);
-    }
-
     let ret: String = token.
         trim_matches(|c: char| !is_valid_char(c)  )
         .into();
 
-    println!("end trim{}", ret);
-
     if ret.eq("") {
         return None;
     }
diff --git a/src/lib.rs b/src/lib.rs
index fea53a2..2cd593c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -274,8 +274,6 @@ impl Index {
                         .iter()
                         .map(|s| (*s).into())
                         .collect();
-
-                    println!("raw tokens: {:?}", raw_tokens);
                 },
                 _ => {
                     raw_tokens = pipeline::tokenize(value.as_ref());
@@ -283,7 +281,6 @@ impl Index {
             }
 
             let tokens = self.pipeline.run(raw_tokens);
-            println!("tokens: {:?}", tokens);
 
             self.document_store
                 .add_field_length(doc_ref, field, tokens.len());
@@ -294,7 +291,7 @@ impl Index {
 
             for (token, count) in &token_freq {
                 let freq = (*count as f64).sqrt();
-                println!("token={}, freq={}", token, freq);
+
                 self.index
                     .get_mut(field)
                     .expect(&format!("InvertedIndex does not exist for field {}", field))

From fc3cb0dcdc8dc2b4201cfb13fb8212f59a1739fc Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Tue, 15 Oct 2019 14:25:45 +0800
Subject: [PATCH 5/8] feat: add Chinese tokenizing function and pipeline

---
 src/lib.rs                     |   8 +-
 src/pipeline.rs                |  11 ++
 tests/data/zh.in.txt           |   1 +
 tests/data/zh.out.txt          | 253 +++++++++++++++++++++++++++++++++
 tests/searchindex_fixture.json |   1 +
 tests/test-compare.rs          |   3 +
 tests/test-lang.rs             |   8 +-
 7 files changed, 276 insertions(+), 9 deletions(-)
 create mode 100644 tests/data/zh.in.txt
 create mode 100644 tests/data/zh.out.txt

diff --git a/src/lib.rs b/src/lib.rs
index 2cd593c..e13440b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -63,7 +63,6 @@ use document_store::DocumentStore;
 use inverted_index::InvertedIndex;
 pub use lang::Language;
 pub use pipeline::Pipeline;
-use jieba_rs::Jieba;
 
 /// A builder for an `Index` with custom parameters.
 ///
@@ -268,12 +267,7 @@ impl Index {
 
             match self.lang {
                 Language::Chinese => {
-                    let jieba = Jieba::new();
-
-                    raw_tokens = jieba.cut_for_search(value.as_ref(), false)
-                        .iter()
-                        .map(|s| (*s).into())
-                        .collect();
+                    raw_tokens = pipeline::tokenize_chinese(value.as_ref());
                 },
                 _ => {
                     raw_tokens = pipeline::tokenize(value.as_ref());
diff --git a/src/pipeline.rs b/src/pipeline.rs
index 0864fac..a0f99d9 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -1,7 +1,9 @@
 //! Defines the pipeline which processes text for inclusion in the index. Most users do not need
 //! to use this module directly.
 
+
 use serde::ser::{Serialize, SerializeSeq, Serializer};
+use jieba_rs::Jieba;
 
 /// Splits a text string into a vector of individual tokens.
 pub fn tokenize(text: &str) -> Vec<String> {
@@ -11,6 +13,15 @@ pub fn tokenize(text: &str) -> Vec<String> {
         .collect()
 }
 
+pub fn tokenize_chinese(text: &str) -> Vec<String> {
+    let jieba = Jieba::new();
+
+    jieba.cut_for_search(text.as_ref(), false)
+        .iter()
+        .map(|s| (*s).into())
+        .collect()
+}
+
 /// The function type used for each step in a pipeline.
 pub type PipelineFn = fn(String) -> Option<String>;
 
diff --git a/tests/data/zh.in.txt b/tests/data/zh.in.txt
new file mode 100644
index 0000000..f95aa96
--- /dev/null
+++ b/tests/data/zh.in.txt
@@ -0,0 +1 @@
+这条法国邮船白拉日隆子爵号（VicomtedeBragelonne）正向中国开来。早晨八点多钟，冲洗过的三等舱甲板湿意未干，但已坐满了人，法国人、德国流亡出来的犹太人、印度人、安南人，不用说还有中国人。海风里早含着燥热，胖人身体给炎风吹干了，上一层汗结的盐霜，仿佛刚在巴勒斯坦的死海里洗过澡。毕竟是清晨，人的兴致还没给太阳晒萎，烘懒，说话做事都很起劲。那几个新派到安南或中国租界当警察的法国人，正围了那年轻善撒娇的犹太女人在调情。俾斯麦曾说过，法国公使大使的特点，就是一句外国话不会讲；这几位警察并不懂德文，居然传情达意，引得犹太女人格格地笑，比他们的外交官强多了。这女人的漂亮丈夫，在旁顾而乐之，因为他几天来，香烟、啤酒、柠檬水沾光了不少。红海已过，不怕热极引火，所以等一会甲板上零星果皮、纸片、瓶塞之外，香烟头定又遍处皆是。法国人的思想是有名的清楚，他的文章也明白干净，但是他的做事，无不混乱、肮脏、喧哗，但看这船上的乱糟糟。这船，倚仗人的机巧，载满人的扰攘，寄满人的希望，热闹地行着，每分钟把沾污了人气的一小方小面，还给那无情、无尽、无际的大海。
\ No newline at end of file
diff --git a/tests/data/zh.out.txt b/tests/data/zh.out.txt
new file mode 100644
index 0000000..82e77a0
--- /dev/null
+++ b/tests/data/zh.out.txt
@@ -0,0 +1,253 @@
+这
+条
+法国
+邮船
+白
+拉
+日隆
+子爵
+号
+VicomtedeBragelonne
+正向
+中国
+开来
+早晨
+八点
+多
+钟
+冲洗
+过
+三等
+三等舱
+甲板
+湿
+意
+未
+干
+但
+已
+坐满
+人
+法国
+国人
+法国人
+德国
+流亡
+出来
+犹太
+犹太人
+印度
+印度人
+安南
+人
+不用
+不用说
+还有
+中国
+人
+海风
+里
+早
+含
+着
+燥热
+胖
+人
+身体
+给
+炎风
+吹干
+上
+一层
+汗
+结
+盐霜
+仿佛
+刚
+在
+巴勒
+勒斯
+巴勒斯
+巴勒斯坦
+死
+海里
+洗过
+洗过澡
+毕竟
+是
+清晨
+人
+兴致
+还
+没
+给
+太阳
+晒
+萎
+烘
+懒
+说话
+做事
+都
+很
+起劲
+那
+几个
+新派
+到
+安南
+或
+中国
+租界
+当
+警察
+法国
+国人
+法国人
+正
+围
+那
+年轻
+善
+撒娇
+犹太
+女人
+在
+调情
+俾斯麦
+曾
+说
+过
+法国
+公使
+大使
+特点
+就是
+一句
+外国
+话
+不会
+讲
+这
+几位
+警察
+并
+不
+懂
+德文
+居然
+传情
+达意
+引得
+犹太
+女人
+格格
+地
+笑
+比
+他们
+外交
+外交官
+强
+多
+这
+女人
+漂亮
+丈夫
+在
+旁
+顾
+而
+乐
+之
+因为
+他
+几天
+来
+香烟
+啤酒
+柠檬
+柠檬水
+沾光
+不少
+红海
+已
+过
+不怕
+热
+极
+引火
+所以
+等
+一会
+甲板
+上
+零星
+果皮
+纸片
+瓶塞
+之外
+香烟
+烟头
+香烟头
+定
+又
+遍
+处
+皆
+是
+法国
+国人
+法国人
+思想
+是
+有名
+清楚
+他
+文章
+也
+明白
+干净
+但是
+他
+做事
+无不
+混乱
+肮脏
+喧哗
+但
+看
+这
+船上
+乱糟
+乱糟糟
+这
+船
+倚仗
+人
+机巧
+载满
+人
+扰攘
+寄满
+人
+希望
+热闹
+地
+行
+着
+分钟
+每分钟
+把
+沾污
+人气
+一小
+方
+小
+面
+还给
+那
+无情
+无尽
+无际
+大海
diff --git a/tests/searchindex_fixture.json b/tests/searchindex_fixture.json
index c3c8ee7..5e7b61c 100644
--- a/tests/searchindex_fixture.json
+++ b/tests/searchindex_fixture.json
@@ -1381,6 +1381,7 @@
       }
     }
   },
+  "lang": "English",
   "pipeline": [
     "trimmer",
     "stopWordFilter",
diff --git a/tests/test-compare.rs b/tests/test-compare.rs
index fc5452a..3e9df51 100644
--- a/tests/test-compare.rs
+++ b/tests/test-compare.rs
@@ -66,6 +66,9 @@ fn search_index_hasnt_changed_accidentally() {
     let new_index = create_index();
     let fixture_index = get_fixture();
 
+    println!("{}", &new_index);
+    println!("{}", &fixture_index);
+
     if new_index != fixture_index {
         panic!("The search index has changed from the fixture");
     }
diff --git a/tests/test-lang.rs b/tests/test-lang.rs
index c58af79..dbec5dc 100644
--- a/tests/test-lang.rs
+++ b/tests/test-lang.rs
@@ -7,7 +7,7 @@ use std::fs::File;
 use std::io::{BufRead, BufReader, Read, Write};
 use std::path::Path;
 
-use elasticlunr::pipeline::tokenize;
+use elasticlunr::pipeline::{tokenize, tokenize_chinese};
 use elasticlunr::*;
 use strum::IntoEnumIterator;
 
@@ -61,7 +61,11 @@ fn compare_to_fixture(lang: Language) {
     let mut output = BufReader::new(File::open(&output).unwrap()).lines();
 
     let pipeline = lang.make_pipeline();
-    let tokens = pipeline.run(tokenize(&input_str));
+    let tokens = if Language::Chinese == lang {
+        pipeline.run(tokenize_chinese(&input_str))
+    } else {
+        pipeline.run(tokenize(&input_str))
+    };
 
     for tok in tokens {
         assert_eq!(

From 379309466c1a19973e73fcbfab5949b540e723fa Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Tue, 15 Oct 2019 14:26:34 +0800
Subject: [PATCH 6/8] refactor: rm unused debug code

---
 tests/test-compare.rs | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/test-compare.rs b/tests/test-compare.rs
index 3e9df51..21c9ab2 100644
--- a/tests/test-compare.rs
+++ b/tests/test-compare.rs
@@ -65,10 +65,7 @@ fn get_fixture() -> serde_json::Value {
 fn search_index_hasnt_changed_accidentally() {
     let new_index = create_index();
     let fixture_index = get_fixture();
-
-    println!("{}", &new_index);
-    println!("{}", &fixture_index);
-
+ 
     if new_index != fixture_index {
         panic!("The search index has changed from the fixture");
     }

From 4b724c957eddd76abb7e2f249ef7fa5f61ba63b3 Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Wed, 16 Oct 2019 19:14:01 +0800
Subject: [PATCH 7/8] refactor: add #[cfg(feature = "zh")] to Chinese-related
 code

---
 .travis.yml             |  2 +-
 examples/export_json.rs | 29 +++++++++-----------------
 examples/out_zh.json    | 45 -----------------------------------------
 src/lib.rs              |  2 +-
 src/pipeline.rs         |  2 ++
 tests/test-lang.rs      | 13 +++++++-----
 6 files changed, 22 insertions(+), 71 deletions(-)
 delete mode 100644 examples/out_zh.json

diff --git a/.travis.yml b/.travis.yml
index 7092cb0..04e23ed 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,7 +6,7 @@ rust:
 cache: cargo
 
 script:
-  - cargo build --verbose --no-default-features
+  - cargo build --verbose  --no-default-features
   - cargo build --verbose
   - cargo test --verbose --no-default-features
   - cargo test --verbose
diff --git a/examples/export_json.rs b/examples/export_json.rs
index 54aba29..9c8cd44 100644
--- a/examples/export_json.rs
+++ b/examples/export_json.rs
@@ -1,34 +1,25 @@
 extern crate elasticlunr;
 
 use elasticlunr::Index;
-use elasticlunr::Language;
 use std::fs::File;
 use std::io::Write;
 
 fn main() {
-    let mut index = Index::with_language(Language::Chinese, &["title", "body"], );
+    let mut index = Index::new(&["title", "body"]);
     index.add_doc(
         "1",
         &[
-            "中华人民，共和国, hello word",
-            "杭州余杭区人民欢迎你"
+            "This Week in Rust 207",
+            "Hello and welcome to another issue of This Week in Rust!",
+        ],
+    );
+    index.add_doc(
+        "2",
+        &[
+            "This Week in Rust 206",
+            "Hello and welcome to another issue of This Week in Rust!",
         ],
     );
-
-//    index.add_doc(
-//        "1",
-//        &[
-//            "This Week in Rust 207",
-//            "Hello and welcome to another issue of This Week in Rust!",
-//        ],
-//    );
-//    index.add_doc(
-//        "2",
-//        &[
-//            "This Week in Rust 206",
-//            "Hello and welcome to another issue of This Week in Rust!",
-//        ],
-//    );
     let mut file = File::create("examples/out.json").unwrap();
     file.write_all(index.to_json_pretty().as_bytes()).unwrap();
 }
diff --git a/examples/out_zh.json b/examples/out_zh.json
deleted file mode 100644
index 110e222..0000000
--- a/examples/out_zh.json
+++ /dev/null
@@ -1,45 +0,0 @@
-{
-  "fields": [
-    "title",
-    "body"
-  ],
-  "pipeline": [
-    "trimmer-zh",
-    "stopWordFilter-zh",
-    "stemmer-zh"
-  ],
-  "ref": "id",
-  "version": "0.9.5",
-  "index": {
-    "body": {
-      "root": {
-        "docs": {},
-        "df": 0
-      }
-    },
-    "title": {
-      "root": {
-        "docs": {},
-        "df": 0
-      }
-    }
-  },
-  "documentStore": {
-    "save": true,
-    "docs": {
-      "1": {
-        "body": "杭州余杭区人民欢迎你",
-        "id": "1",
-        "title": "中华人民共和国"
-      }
-    },
-    "docInfo": {
-      "1": {
-        "body": 0,
-        "title": 0
-      }
-    },
-    "length": 1
-  },
-  "lang": "Chinese"
-}
\ No newline at end of file
diff --git a/src/lib.rs b/src/lib.rs
index e13440b..2b464ef 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,7 +43,6 @@ extern crate rust_stemmers;
 #[cfg(test)]
 #[macro_use]
 extern crate maplit;
-
 #[cfg(feature = "zh")]
 extern crate jieba_rs;
 
@@ -266,6 +265,7 @@ impl Index {
             let raw_tokens: Vec<String>;
 
             match self.lang {
+                #[cfg(feature = "zh")]
                 Language::Chinese => {
                     raw_tokens = pipeline::tokenize_chinese(value.as_ref());
                 },
diff --git a/src/pipeline.rs b/src/pipeline.rs
index a0f99d9..96ea14f 100644
--- a/src/pipeline.rs
+++ b/src/pipeline.rs
@@ -3,6 +3,7 @@
 
 
 use serde::ser::{Serialize, SerializeSeq, Serializer};
+#[cfg(feature = "zh")]
 use jieba_rs::Jieba;
 
 /// Splits a text string into a vector of individual tokens.
@@ -13,6 +14,7 @@ pub fn tokenize(text: &str) -> Vec<String> {
         .collect()
 }
 
+#[cfg(feature = "zh")]
 pub fn tokenize_chinese(text: &str) -> Vec<String> {
     let jieba = Jieba::new();
 
diff --git a/tests/test-lang.rs b/tests/test-lang.rs
index dbec5dc..4b63669 100644
--- a/tests/test-lang.rs
+++ b/tests/test-lang.rs
@@ -7,7 +7,9 @@ use std::fs::File;
 use std::io::{BufRead, BufReader, Read, Write};
 use std::path::Path;
 
-use elasticlunr::pipeline::{tokenize, tokenize_chinese};
+use elasticlunr::pipeline::tokenize;
+#[cfg(feature = "zh")]
+use elasticlunr::pipeline::tokenize_chinese;
 use elasticlunr::*;
 use strum::IntoEnumIterator;
 
@@ -61,10 +63,11 @@ fn compare_to_fixture(lang: Language) {
     let mut output = BufReader::new(File::open(&output).unwrap()).lines();
 
     let pipeline = lang.make_pipeline();
-    let tokens = if Language::Chinese == lang {
-        pipeline.run(tokenize_chinese(&input_str))
-    } else {
-        pipeline.run(tokenize(&input_str))
+
+    let tokens = match lang {
+        #[cfg(feature = "zh")]
+        Language::Chinese => pipeline.run(tokenize_chinese(&input_str)),
+        _ => pipeline.run(tokenize(&input_str)),
     };
 
     for tok in tokens {

From 9396bc5b192df9458708d2b441f7a3f7f65adc4f Mon Sep 17 00:00:00 2001
From: liukz <liukz@moresec.cn>
Date: Wed, 16 Oct 2019 19:14:44 +0800
Subject: [PATCH 8/8] refactor: add #[cfg(feature = "zh")] to Chinese-related
 code

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 04e23ed..7092cb0 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,7 +6,7 @@ rust:
 cache: cargo
 
 script:
-  - cargo build --verbose  --no-default-features
+  - cargo build --verbose --no-default-features
   - cargo build --verbose
   - cargo test --verbose --no-default-features
   - cargo test --verbose