From 4cb9540c99616c159a1673854a011512e4d2801d Mon Sep 17 00:00:00 2001 From: liukz Date: Mon, 19 Aug 2019 19:34:56 +0800 Subject: [PATCH 1/8] feat: add Chinese language just at the begining of adding Chinese --- .gitignore | 5 ++--- Cargo.toml | 4 +++- examples/export_json.rs | 29 +++++++++++++++++--------- examples/out_zh.json | 45 +++++++++++++++++++++++++++++++++++++++++ src/lang/mod.rs | 13 +++++++++++- src/lang/zh.rs | 24 ++++++++++++++++++++++ src/lib.rs | 27 ++++++++++++++++++++++++- 7 files changed, 131 insertions(+), 16 deletions(-) create mode 100644 examples/out_zh.json create mode 100644 src/lang/zh.rs diff --git a/.gitignore b/.gitignore index 32891a1..2152445 100644 --- a/.gitignore +++ b/.gitignore @@ -8,8 +8,7 @@ Cargo.lock # These are backup files generated by rustfmt **/*.rs.bk - examples/out.json out.json - -**/node_modules/ \ No newline at end of file +**/node_modules/ +.idea/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index c703ac7..cceed35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,13 +28,14 @@ serde_derive = "1.0.34" # First verstion to support #[serde(flatten)] serde_json = "1" strum = "0.15" strum_macros = "0.15" +jieba-rs = "0.4.10" [features] default = ["languages"] nightly = ["bench"] bench = [] -languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr"] +languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr", "zh"] da = ["rust-stemmers"] de = ["rust-stemmers"] du = ["rust-stemmers"] @@ -47,3 +48,4 @@ ro = ["rust-stemmers"] ru = ["rust-stemmers"] sv = ["rust-stemmers"] tr = ["rust-stemmers"] +zh = ["rust-stemmers"] diff --git a/examples/export_json.rs b/examples/export_json.rs index 9c8cd44..8740369 100644 --- a/examples/export_json.rs +++ b/examples/export_json.rs @@ -1,25 +1,34 @@ extern crate elasticlunr; use elasticlunr::Index; +use elasticlunr::Language; use std::fs::File; use std::io::Write; fn main() { - let mut index = Index::new(&["title", "body"]); + let mut index = Index::with_language(Language::Chinese, &["title", "body"], ); index.add_doc( "1", &[ - "This Week in Rust 207", - "Hello and welcome to another issue of This Week in Rust!", - ], - ); - index.add_doc( - "2", - &[ - "This Week in Rust 206", - "Hello and welcome to another issue of This Week in Rust!", + "中华人民共和国", + "杭州余杭区人民欢迎你" ], ); + +// index.add_doc( +// "1", +// &[ +// "This Week in Rust 207", +// "Hello and welcome to another issue of This Week in Rust!", +// ], +// ); +// index.add_doc( +// "2", +// &[ +// "This Week in Rust 206", +// "Hello and welcome to another issue of This Week in Rust!", +// ], +// ); let mut file = File::create("examples/out.json").unwrap(); file.write_all(index.to_json_pretty().as_bytes()).unwrap(); } diff --git a/examples/out_zh.json b/examples/out_zh.json new file mode 100644 index 0000000..110e222 --- /dev/null +++ b/examples/out_zh.json @@ -0,0 +1,45 @@ +{ + "fields": [ + "title", + "body" + ], + "pipeline": [ + "trimmer-zh", + "stopWordFilter-zh", + "stemmer-zh" + ], + "ref": "id", + "version": "0.9.5", + "index": { + "body": { + "root": { + "docs": {}, + "df": 0 + } + }, + "title": { + "root": { + "docs": {}, + "df": 0 + } + } + }, + "documentStore": { + "save": true, + "docs": { + "1": { + "body": "杭州余杭区人民欢迎你", + "id": "1", + "title": "中华人民共和国" + } + }, + "docInfo": { + "1": { + "body": 0, + "title": 0 + } + }, + "length": 1 + }, + "lang": "Chinese" +} \ No newline at end of file diff --git a/src/lang/mod.rs b/src/lang/mod.rs index 9edd84a..fc31ddb 100644 --- a/src/lang/mod.rs +++ b/src/lang/mod.rs @@ -54,7 +54,7 @@ macro_rules! make_stemmer { } /// Used to configure the `Index` for a specific lanugage. -#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter)] +#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter, Serialize, Deserialize)] pub enum Language { English, #[cfg(feature = "da")] @@ -81,6 +81,8 @@ pub enum Language { Swedish, #[cfg(feature = "tr")] Turkish, + #[cfg(feature = "zh")] + Chinese, #[doc(hidden)] #[strum(disabled = "true")] __NonExhaustive, @@ -123,6 +125,8 @@ impl Language { "sv" => Some(Language::Swedish), #[cfg(feature = "tr")] "tr" => Some(Language::Turkish), + #[cfg(feature = "zh")] + "zh" => Some(Language::Chinese), _ => None, } } @@ -162,6 +166,8 @@ impl Language { Language::Swedish => "sv", #[cfg(feature = "tr")] Language::Turkish => "tr", + #[cfg(feature = "zh")] + Language::Chinese => "zh", _ => panic!("Don't use the __NonExhaustive variant!"), } } @@ -194,11 +200,14 @@ impl Language { Language::Swedish => ::lang::sv::make_pipeline(), #[cfg(feature = "tr")] Language::Turkish => ::lang::tr::make_pipeline(), + #[cfg(feature = "zh")] + Language::Chinese => ::lang::zh::make_pipeline(), _ => panic!("Dont use the `__NonExhaustive` variant!"), } } } + pub mod en; #[cfg(feature = "da")] @@ -225,3 +234,5 @@ pub mod ru; pub mod sv; #[cfg(feature = "tr")] pub mod tr; +#[cfg(feature = "zh")] +pub mod zh; \ No newline at end of file diff --git a/src/lang/zh.rs b/src/lang/zh.rs new file mode 100644 index 0000000..c07a4ca --- /dev/null +++ b/src/lang/zh.rs @@ -0,0 +1,24 @@ +use pipeline::Pipeline; + + +pub fn make_pipeline() -> Pipeline { + Pipeline { + queue: vec![ + ("trimmer-zh".into(), trimmer), + ("stopWordFilter-zh".into(), stop_word_filter), + ("stemmer-zh".into(), stemmer), + ], + } +} + +pub fn trimmer(token: String) -> Option { + Some(token) +} + +make_stop_word_filter!([ + "" +]); + +fn stemmer(token: String) -> Option { + Some(token) +} diff --git a/src/lib.rs b/src/lib.rs index 7858da6..4217ba7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,10 @@ extern crate rust_stemmers; #[macro_use] extern crate maplit; +#[cfg(feature = "zh")] +extern crate jieba_rs; + + /// The version of elasticlunr.js this library was designed for. pub const ELASTICLUNR_VERSION: &str = "0.9.5"; @@ -59,6 +63,7 @@ use document_store::DocumentStore; use inverted_index::InvertedIndex; pub use lang::Language; pub use pipeline::Pipeline; +use jieba_rs::Jieba; /// A builder for an `Index` with custom parameters. /// @@ -149,6 +154,7 @@ impl IndexBuilder { document_store: DocumentStore::new(self.save), pipeline: self.pipeline.unwrap_or_default(), version: ::ELASTICLUNR_VERSION, + lang: Language::English, } } } @@ -165,6 +171,7 @@ pub struct Index { pub version: &'static str, index: BTreeMap, pub document_store: DocumentStore, + lang: Language, } impl Index { @@ -226,6 +233,7 @@ impl Index { ref_field: "id".into(), version: ::ELASTICLUNR_VERSION, document_store: DocumentStore::new(true), + lang: lang, } } @@ -256,7 +264,23 @@ impl Index { continue; } - let tokens = self.pipeline.run(pipeline::tokenize(value.as_ref())); + let raw_tokens: Vec; + + if self.lang == Language::Chinese { + let jieba = Jieba::new(); + raw_tokens = jieba.cut_for_search(value.as_ref(), false) + .iter() + .map(|s| (*s).into()) + .collect(); + + println!("raw tokens: {:?}", raw_tokens); + } else { + raw_tokens = pipeline::tokenize(value.as_ref()); + } + + let tokens = self.pipeline.run(raw_tokens); + println!("tokens: {:?}", tokens); + self.document_store .add_field_length(doc_ref, field, tokens.len()); @@ -266,6 +290,7 @@ impl Index { for (token, count) in &token_freq { let freq = (*count as f64).sqrt(); + println!("token={}, freq={}", token, freq); self.index .get_mut(field) .expect(&format!("InvertedIndex does not exist for field {}", field)) From ca26173f37cf5eb1153078105f26a71ea3b4f421 Mon Sep 17 00:00:00 2001 From: liukz Date: Mon, 19 Aug 2019 20:36:33 +0800 Subject: [PATCH 2/8] feat: add Chinese language add Chinese trimmer --- examples/export_json.rs | 2 +- src/lang/zh.rs | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/examples/export_json.rs b/examples/export_json.rs index 8740369..54aba29 100644 --- a/examples/export_json.rs +++ b/examples/export_json.rs @@ -10,7 +10,7 @@ fn main() { index.add_doc( "1", &[ - "中华人民共和国", + "中华人民,共和国, hello word", "杭州余杭区人民欢迎你" ], ); diff --git a/src/lang/zh.rs b/src/lang/zh.rs index c07a4ca..7f06117 100644 --- a/src/lang/zh.rs +++ b/src/lang/zh.rs @@ -11,14 +11,48 @@ pub fn make_pipeline() -> Pipeline { } } + pub fn trimmer(token: String) -> Option { - Some(token) + println!("trim {}", token); + + for c in token.chars() { + println!("{}, {}", c, c as u32); + } + + let ret: String = token. + trim_matches(|c: char| !is_valid_char(c) ) + .into(); + + println!("end trim{}", ret); + + if ret.eq("") { + return None; + } + + Some(ret) } make_stop_word_filter!([ - "" + "的", "了" ]); fn stemmer(token: String) -> Option { Some(token) } + +fn is_valid_char(c: char) -> bool { + let min_max_list = [ + [19668, 40869], // min and max Chinese char + ['a' as u32, 'z' as u32], + ['A' as u32, 'Z' as u32] + ]; + + let c = c as u32; + for min_max in min_max_list.iter() { + if c >= min_max[0] && c <= min_max[1] { + return true; + } + } + + false +} \ No newline at end of file From f6d814b30dcea9a14f53260a937f4bd84f9f7947 Mon Sep 17 00:00:00 2001 From: liukz Date: Tue, 20 Aug 2019 11:15:05 +0800 Subject: [PATCH 3/8] style: use match instead of if to executing different tokenizing function by Language --- src/lib.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4217ba7..fea53a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -266,16 +266,20 @@ impl Index { let raw_tokens: Vec; - if self.lang == Language::Chinese { - let jieba = Jieba::new(); - raw_tokens = jieba.cut_for_search(value.as_ref(), false) - .iter() - .map(|s| (*s).into()) - .collect(); - - println!("raw tokens: {:?}", raw_tokens); - } else { - raw_tokens = pipeline::tokenize(value.as_ref()); + match self.lang { + Language::Chinese => { + let jieba = Jieba::new(); + + raw_tokens = jieba.cut_for_search(value.as_ref(), false) + .iter() + .map(|s| (*s).into()) + .collect(); + + println!("raw tokens: {:?}", raw_tokens); + }, + _ => { + raw_tokens = pipeline::tokenize(value.as_ref()); + } } let tokens = self.pipeline.run(raw_tokens); From 7f1d9df992b7256f6edf7886f79c605c5ebbf176 Mon Sep 17 00:00:00 2001 From: liukz Date: Tue, 20 Aug 2019 14:52:48 +0800 Subject: [PATCH 4/8] refactor: remote debugging println! --- src/lang/zh.rs | 8 -------- src/lib.rs | 5 +---- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/lang/zh.rs b/src/lang/zh.rs index 7f06117..5d29384 100644 --- a/src/lang/zh.rs +++ b/src/lang/zh.rs @@ -13,18 +13,10 @@ pub fn make_pipeline() -> Pipeline { pub fn trimmer(token: String) -> Option { - println!("trim {}", token); - - for c in token.chars() { - println!("{}, {}", c, c as u32); - } - let ret: String = token. trim_matches(|c: char| !is_valid_char(c) ) .into(); - println!("end trim{}", ret); - if ret.eq("") { return None; } diff --git a/src/lib.rs b/src/lib.rs index fea53a2..2cd593c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -274,8 +274,6 @@ impl Index { .iter() .map(|s| (*s).into()) .collect(); - - println!("raw tokens: {:?}", raw_tokens); }, _ => { raw_tokens = pipeline::tokenize(value.as_ref()); @@ -283,7 +281,6 @@ impl Index { } let tokens = self.pipeline.run(raw_tokens); - println!("tokens: {:?}", tokens); self.document_store .add_field_length(doc_ref, field, tokens.len()); @@ -294,7 +291,7 @@ impl Index { for (token, count) in &token_freq { let freq = (*count as f64).sqrt(); - println!("token={}, freq={}", token, freq); + self.index .get_mut(field) .expect(&format!("InvertedIndex does not exist for field {}", field)) From fc3cb0dcdc8dc2b4201cfb13fb8212f59a1739fc Mon Sep 17 00:00:00 2001 From: liukz Date: Tue, 15 Oct 2019 14:25:45 +0800 Subject: [PATCH 5/8] feat: add Chinese tokenizing function and pipeline --- src/lib.rs | 8 +- src/pipeline.rs | 11 ++ tests/data/zh.in.txt | 1 + tests/data/zh.out.txt | 253 +++++++++++++++++++++++++++++++++ tests/searchindex_fixture.json | 1 + tests/test-compare.rs | 3 + tests/test-lang.rs | 8 +- 7 files changed, 276 insertions(+), 9 deletions(-) create mode 100644 tests/data/zh.in.txt create mode 100644 tests/data/zh.out.txt diff --git a/src/lib.rs b/src/lib.rs index 2cd593c..e13440b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,6 @@ use document_store::DocumentStore; use inverted_index::InvertedIndex; pub use lang::Language; pub use pipeline::Pipeline; -use jieba_rs::Jieba; /// A builder for an `Index` with custom parameters. /// @@ -268,12 +267,7 @@ impl Index { match self.lang { Language::Chinese => { - let jieba = Jieba::new(); - - raw_tokens = jieba.cut_for_search(value.as_ref(), false) - .iter() - .map(|s| (*s).into()) - .collect(); + raw_tokens = pipeline::tokenize_chinese(value.as_ref()); }, _ => { raw_tokens = pipeline::tokenize(value.as_ref()); diff --git a/src/pipeline.rs b/src/pipeline.rs index 0864fac..a0f99d9 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -1,7 +1,9 @@ //! Defines the pipeline which processes text for inclusion in the index. Most users do not need //! to use this module directly. + use serde::ser::{Serialize, SerializeSeq, Serializer}; +use jieba_rs::Jieba; /// Splits a text string into a vector of individual tokens. pub fn tokenize(text: &str) -> Vec { @@ -11,6 +13,15 @@ pub fn tokenize(text: &str) -> Vec { .collect() } +pub fn tokenize_chinese(text: &str) -> Vec { + let jieba = Jieba::new(); + + jieba.cut_for_search(text.as_ref(), false) + .iter() + .map(|s| (*s).into()) + .collect() +} + /// The function type used for each step in a pipeline. pub type PipelineFn = fn(String) -> Option; diff --git a/tests/data/zh.in.txt b/tests/data/zh.in.txt new file mode 100644 index 0000000..f95aa96 --- /dev/null +++ b/tests/data/zh.in.txt @@ -0,0 +1 @@ +这条法国邮船白拉日隆子爵号(VicomtedeBragelonne)正向中国开来。早晨八点多钟,冲洗过的三等舱甲板湿意未干,但已坐满了人,法国人、德国流亡出来的犹太人、印度人、安南人,不用说还有中国人。海风里早含着燥热,胖人身体给炎风吹干了,上一层汗结的盐霜,仿佛刚在巴勒斯坦的死海里洗过澡。毕竟是清晨,人的兴致还没给太阳晒萎,烘懒,说话做事都很起劲。那几个新派到安南或中国租界当警察的法国人,正围了那年轻善撒娇的犹太女人在调情。俾斯麦曾说过,法国公使大使的特点,就是一句外国话不会讲;这几位警察并不懂德文,居然传情达意,引得犹太女人格格地笑,比他们的外交官强多了。这女人的漂亮丈夫,在旁顾而乐之,因为他几天来,香烟、啤酒、柠檬水沾光了不少。红海已过,不怕热极引火,所以等一会甲板上零星果皮、纸片、瓶塞之外,香烟头定又遍处皆是。法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。 \ No newline at end of file diff --git a/tests/data/zh.out.txt b/tests/data/zh.out.txt new file mode 100644 index 0000000..82e77a0 --- /dev/null +++ b/tests/data/zh.out.txt @@ -0,0 +1,253 @@ +这 +条 +法国 +邮船 +白 +拉 +日隆 +子爵 +号 +VicomtedeBragelonne +正向 +中国 +开来 +早晨 +八点 +多 +钟 +冲洗 +过 +三等 +三等舱 +甲板 +湿 +意 +未 +干 +但 +已 +坐满 +人 +法国 +国人 +法国人 +德国 +流亡 +出来 +犹太 +犹太人 +印度 +印度人 +安南 +人 +不用 +不用说 +还有 +中国 +人 +海风 +里 +早 +含 +着 +燥热 +胖 +人 +身体 +给 +炎风 +吹干 +上 +一层 +汗 +结 +盐霜 +仿佛 +刚 +在 +巴勒 +勒斯 +巴勒斯 +巴勒斯坦 +死 +海里 +洗过 +洗过澡 +毕竟 +是 +清晨 +人 +兴致 +还 +没 +给 +太阳 +晒 +萎 +烘 +懒 +说话 +做事 +都 +很 +起劲 +那 +几个 +新派 +到 +安南 +或 +中国 +租界 +当 +警察 +法国 +国人 +法国人 +正 +围 +那 +年轻 +善 +撒娇 +犹太 +女人 +在 +调情 +俾斯麦 +曾 +说 +过 +法国 +公使 +大使 +特点 +就是 +一句 +外国 +话 +不会 +讲 +这 +几位 +警察 +并 +不 +懂 +德文 +居然 +传情 +达意 +引得 +犹太 +女人 +格格 +地 +笑 +比 +他们 +外交 +外交官 +强 +多 +这 +女人 +漂亮 +丈夫 +在 +旁 +顾 +而 +乐 +之 +因为 +他 +几天 +来 +香烟 +啤酒 +柠檬 +柠檬水 +沾光 +不少 +红海 +已 +过 +不怕 +热 +极 +引火 +所以 +等 +一会 +甲板 +上 +零星 +果皮 +纸片 +瓶塞 +之外 +香烟 +烟头 +香烟头 +定 +又 +遍 +处 +皆 +是 +法国 +国人 +法国人 +思想 +是 +有名 +清楚 +他 +文章 +也 +明白 +干净 +但是 +他 +做事 +无不 +混乱 +肮脏 +喧哗 +但 +看 +这 +船上 +乱糟 +乱糟糟 +这 +船 +倚仗 +人 +机巧 +载满 +人 +扰攘 +寄满 +人 +希望 +热闹 +地 +行 +着 +分钟 +每分钟 +把 +沾污 +人气 +一小 +方 +小 +面 +还给 +那 +无情 +无尽 +无际 +大海 diff --git a/tests/searchindex_fixture.json b/tests/searchindex_fixture.json index c3c8ee7..5e7b61c 100644 --- a/tests/searchindex_fixture.json +++ b/tests/searchindex_fixture.json @@ -1381,6 +1381,7 @@ } } }, + "lang": "English", "pipeline": [ "trimmer", "stopWordFilter", diff --git a/tests/test-compare.rs b/tests/test-compare.rs index fc5452a..3e9df51 100644 --- a/tests/test-compare.rs +++ b/tests/test-compare.rs @@ -66,6 +66,9 @@ fn search_index_hasnt_changed_accidentally() { let new_index = create_index(); let fixture_index = get_fixture(); + println!("{}", &new_index); + println!("{}", &fixture_index); + if new_index != fixture_index { panic!("The search index has changed from the fixture"); } diff --git a/tests/test-lang.rs b/tests/test-lang.rs index c58af79..dbec5dc 100644 --- a/tests/test-lang.rs +++ b/tests/test-lang.rs @@ -7,7 +7,7 @@ use std::fs::File; use std::io::{BufRead, BufReader, Read, Write}; use std::path::Path; -use elasticlunr::pipeline::tokenize; +use elasticlunr::pipeline::{tokenize, tokenize_chinese}; use elasticlunr::*; use strum::IntoEnumIterator; @@ -61,7 +61,11 @@ fn compare_to_fixture(lang: Language) { let mut output = BufReader::new(File::open(&output).unwrap()).lines(); let pipeline = lang.make_pipeline(); - let tokens = pipeline.run(tokenize(&input_str)); + let tokens = if Language::Chinese == lang { + pipeline.run(tokenize_chinese(&input_str)) + } else { + pipeline.run(tokenize(&input_str)) + }; for tok in tokens { assert_eq!( From 379309466c1a19973e73fcbfab5949b540e723fa Mon Sep 17 00:00:00 2001 From: liukz Date: Tue, 15 Oct 2019 14:26:34 +0800 Subject: [PATCH 6/8] refact: rm unused debug code --- tests/test-compare.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test-compare.rs b/tests/test-compare.rs index 3e9df51..21c9ab2 100644 --- a/tests/test-compare.rs +++ b/tests/test-compare.rs @@ -65,10 +65,7 @@ fn get_fixture() -> serde_json::Value { fn search_index_hasnt_changed_accidentally() { let new_index = create_index(); let fixture_index = get_fixture(); - - println!("{}", &new_index); - println!("{}", &fixture_index); - + if new_index != fixture_index { panic!("The search index has changed from the fixture"); } From 4b724c957eddd76abb7e2f249ef7fa5f61ba63b3 Mon Sep 17 00:00:00 2001 From: liukz Date: Wed, 16 Oct 2019 19:14:01 +0800 Subject: [PATCH 7/8] refact: add #[cfg(feature = "zh")] to code where Chinese about --- .travis.yml | 2 +- examples/export_json.rs | 29 +++++++++----------------- examples/out_zh.json | 45 ----------------------------------------- src/lib.rs | 2 +- src/pipeline.rs | 2 ++ tests/test-lang.rs | 13 +++++++----- 6 files changed, 22 insertions(+), 71 deletions(-) delete mode 100644 examples/out_zh.json diff --git a/.travis.yml b/.travis.yml index 7092cb0..04e23ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ rust: cache: cargo script: - - cargo build --verbose --no-default-features + - cargo build --verbose --no-default-features - cargo build --verbose - cargo test --verbose --no-default-features - cargo test --verbose diff --git a/examples/export_json.rs b/examples/export_json.rs index 54aba29..9c8cd44 100644 --- a/examples/export_json.rs +++ b/examples/export_json.rs @@ -1,34 +1,25 @@ extern crate elasticlunr; use elasticlunr::Index; -use elasticlunr::Language; use std::fs::File; use std::io::Write; fn main() { - let mut index = Index::with_language(Language::Chinese, &["title", "body"], ); + let mut index = Index::new(&["title", "body"]); index.add_doc( "1", &[ - "中华人民,共和国, hello word", - "杭州余杭区人民欢迎你" + "This Week in Rust 207", + "Hello and welcome to another issue of This Week in Rust!", + ], + ); + index.add_doc( + "2", + &[ + "This Week in Rust 206", + "Hello and welcome to another issue of This Week in Rust!", ], ); - -// index.add_doc( -// "1", -// &[ -// "This Week in Rust 207", -// "Hello and welcome to another issue of This Week in Rust!", -// ], -// ); -// index.add_doc( -// "2", -// &[ -// "This Week in Rust 206", -// "Hello and welcome to another issue of This Week in Rust!", -// ], -// ); let mut file = File::create("examples/out.json").unwrap(); file.write_all(index.to_json_pretty().as_bytes()).unwrap(); } diff --git a/examples/out_zh.json b/examples/out_zh.json deleted file mode 100644 index 110e222..0000000 --- a/examples/out_zh.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "fields": [ - "title", - "body" - ], - "pipeline": [ - "trimmer-zh", - "stopWordFilter-zh", - "stemmer-zh" - ], - "ref": "id", - "version": "0.9.5", - "index": { - "body": { - "root": { - "docs": {}, - "df": 0 - } - }, - "title": { - "root": { - "docs": {}, - "df": 0 - } - } - }, - "documentStore": { - "save": true, - "docs": { - "1": { - "body": "杭州余杭区人民欢迎你", - "id": "1", - "title": "中华人民共和国" - } - }, - "docInfo": { - "1": { - "body": 0, - "title": 0 - } - }, - "length": 1 - }, - "lang": "Chinese" -} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index e13440b..2b464ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,7 +43,6 @@ extern crate rust_stemmers; #[cfg(test)] #[macro_use] extern crate maplit; - #[cfg(feature = "zh")] extern crate jieba_rs; @@ -266,6 +265,7 @@ impl Index { let raw_tokens: Vec; match self.lang { + #[cfg(feature = "zh")] Language::Chinese => { raw_tokens = pipeline::tokenize_chinese(value.as_ref()); }, diff --git a/src/pipeline.rs b/src/pipeline.rs index a0f99d9..96ea14f 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -3,6 +3,7 @@ use serde::ser::{Serialize, SerializeSeq, Serializer}; +#[cfg(feature = "zh")] use jieba_rs::Jieba; /// Splits a text string into a vector of individual tokens. @@ -13,6 +14,7 @@ pub fn tokenize(text: &str) -> Vec { .collect() } +#[cfg(feature = "zh")] pub fn tokenize_chinese(text: &str) -> Vec { let jieba = Jieba::new(); diff --git a/tests/test-lang.rs b/tests/test-lang.rs index dbec5dc..4b63669 100644 --- a/tests/test-lang.rs +++ b/tests/test-lang.rs @@ -7,7 +7,9 @@ use std::fs::File; use std::io::{BufRead, BufReader, Read, Write}; use std::path::Path; -use elasticlunr::pipeline::{tokenize, tokenize_chinese}; +use elasticlunr::pipeline::tokenize; +#[cfg(feature = "zh")] +use elasticlunr::pipeline::tokenize_chinese; use elasticlunr::*; use strum::IntoEnumIterator; @@ -61,10 +63,11 @@ fn compare_to_fixture(lang: Language) { let mut output = BufReader::new(File::open(&output).unwrap()).lines(); let pipeline = lang.make_pipeline(); - let tokens = if Language::Chinese == lang { - pipeline.run(tokenize_chinese(&input_str)) - } else { - pipeline.run(tokenize(&input_str)) + + let tokens = match lang { + #[cfg(feature = "zh")] + Language::Chinese => pipeline.run(tokenize_chinese(&input_str)), + _ => pipeline.run(tokenize(&input_str)), }; for tok in tokens { From 9396bc5b192df9458708d2b441f7a3f7f65adc4f Mon Sep 17 00:00:00 2001 From: liukz Date: Wed, 16 Oct 2019 19:14:44 +0800 Subject: [PATCH 8/8] refact: add #[cfg(feature = "zh")] to code where Chinese about --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 04e23ed..7092cb0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ rust: cache: cargo script: - - cargo build --verbose --no-default-features + - cargo build --verbose --no-default-features - cargo build --verbose - cargo test --verbose --no-default-features - cargo test --verbose