diff --git a/.gitignore b/.gitignore index 32891a1..2152445 100644 --- a/.gitignore +++ b/.gitignore @@ -8,8 +8,7 @@ Cargo.lock # These are backup files generated by rustfmt **/*.rs.bk - examples/out.json out.json - -**/node_modules/ \ No newline at end of file +**/node_modules/ +.idea/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index c703ac7..cceed35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,13 +28,14 @@ serde_derive = "1.0.34" # First verstion to support #[serde(flatten)] serde_json = "1" strum = "0.15" strum_macros = "0.15" +jieba-rs = "0.4.10" [features] default = ["languages"] nightly = ["bench"] bench = [] -languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr"] +languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr", "zh"] da = ["rust-stemmers"] de = ["rust-stemmers"] du = ["rust-stemmers"] @@ -47,3 +48,4 @@ ro = ["rust-stemmers"] ru = ["rust-stemmers"] sv = ["rust-stemmers"] tr = ["rust-stemmers"] +zh = ["rust-stemmers"] diff --git a/src/lang/mod.rs b/src/lang/mod.rs index 9edd84a..fc31ddb 100644 --- a/src/lang/mod.rs +++ b/src/lang/mod.rs @@ -54,7 +54,7 @@ macro_rules! make_stemmer { } /// Used to configure the `Index` for a specific lanugage. -#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter)] +#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter, Serialize, Deserialize)] pub enum Language { English, #[cfg(feature = "da")] @@ -81,6 +81,8 @@ pub enum Language { Swedish, #[cfg(feature = "tr")] Turkish, + #[cfg(feature = "zh")] + Chinese, #[doc(hidden)] #[strum(disabled = "true")] __NonExhaustive, @@ -123,6 +125,8 @@ impl Language { "sv" => Some(Language::Swedish), #[cfg(feature = "tr")] "tr" => Some(Language::Turkish), + #[cfg(feature = "zh")] + "zh" => Some(Language::Chinese), _ => None, } } @@ -162,6 +166,8 @@ impl Language { Language::Swedish => "sv", #[cfg(feature = "tr")] Language::Turkish => "tr", + #[cfg(feature = "zh")] + Language::Chinese => "zh", _ => panic!("Don't use the __NonExhaustive variant!"), } } @@ -194,11 +200,14 @@ impl Language { Language::Swedish => ::lang::sv::make_pipeline(), #[cfg(feature = "tr")] Language::Turkish => ::lang::tr::make_pipeline(), + #[cfg(feature = "zh")] + Language::Chinese => ::lang::zh::make_pipeline(), _ => panic!("Dont use the `__NonExhaustive` variant!"), } } } + pub mod en; #[cfg(feature = "da")] @@ -225,3 +234,5 @@ pub mod ru; pub mod sv; #[cfg(feature = "tr")] pub mod tr; +#[cfg(feature = "zh")] +pub mod zh; \ No newline at end of file diff --git a/src/lang/zh.rs b/src/lang/zh.rs new file mode 100644 index 0000000..5d29384 --- /dev/null +++ b/src/lang/zh.rs @@ -0,0 +1,50 @@ +use pipeline::Pipeline; + + +pub fn make_pipeline() -> Pipeline { + Pipeline { + queue: vec![ + ("trimmer-zh".into(), trimmer), + ("stopWordFilter-zh".into(), stop_word_filter), + ("stemmer-zh".into(), stemmer), + ], + } +} + + +pub fn trimmer(token: String) -> Option { + let ret: String = token. + trim_matches(|c: char| !is_valid_char(c) ) + .into(); + + if ret.eq("") { + return None; + } + + Some(ret) +} + +make_stop_word_filter!([ + "的", "了" +]); + +fn stemmer(token: String) -> Option { + Some(token) +} + +fn is_valid_char(c: char) -> bool { + let min_max_list = [ + [19668, 40869], // min and max Chinese char + ['a' as u32, 'z' as u32], + ['A' as u32, 'Z' as u32] + ]; + + let c = c as u32; + for min_max in min_max_list.iter() { + if c >= min_max[0] && c <= min_max[1] { + return true; + } + } + + false +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 7858da6..2b464ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,6 +43,9 @@ extern crate rust_stemmers; #[cfg(test)] #[macro_use] extern crate maplit; +#[cfg(feature = "zh")] +extern crate jieba_rs; + /// The version of elasticlunr.js this library was designed for. pub const ELASTICLUNR_VERSION: &str = "0.9.5"; @@ -149,6 +152,7 @@ impl IndexBuilder { document_store: DocumentStore::new(self.save), pipeline: self.pipeline.unwrap_or_default(), version: ::ELASTICLUNR_VERSION, + lang: Language::English, } } } @@ -165,6 +169,7 @@ pub struct Index { pub version: &'static str, index: BTreeMap, pub document_store: DocumentStore, + lang: Language, } impl Index { @@ -226,6 +231,7 @@ impl Index { ref_field: "id".into(), version: ::ELASTICLUNR_VERSION, document_store: DocumentStore::new(true), + lang: lang, } } @@ -256,7 +262,20 @@ impl Index { continue; } - let tokens = self.pipeline.run(pipeline::tokenize(value.as_ref())); + let raw_tokens: Vec; + + match self.lang { + #[cfg(feature = "zh")] + Language::Chinese => { + raw_tokens = pipeline::tokenize_chinese(value.as_ref()); + }, + _ => { + raw_tokens = pipeline::tokenize(value.as_ref()); + } + } + + let tokens = self.pipeline.run(raw_tokens); + self.document_store .add_field_length(doc_ref, field, tokens.len()); @@ -266,6 +285,7 @@ impl Index { for (token, count) in &token_freq { let freq = (*count as f64).sqrt(); + self.index .get_mut(field) .expect(&format!("InvertedIndex does not exist for field {}", field)) diff --git a/src/pipeline.rs b/src/pipeline.rs index 0864fac..96ea14f 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -1,7 +1,10 @@ //! Defines the pipeline which processes text for inclusion in the index. Most users do not need //! to use this module directly. + use serde::ser::{Serialize, SerializeSeq, Serializer}; +#[cfg(feature = "zh")] +use jieba_rs::Jieba; /// Splits a text string into a vector of individual tokens. pub fn tokenize(text: &str) -> Vec { @@ -11,6 +14,16 @@ pub fn tokenize(text: &str) -> Vec { .collect() } +#[cfg(feature = "zh")] +pub fn tokenize_chinese(text: &str) -> Vec { + let jieba = Jieba::new(); + + jieba.cut_for_search(text.as_ref(), false) + .iter() + .map(|s| (*s).into()) + .collect() +} + /// The function type used for each step in a pipeline. pub type PipelineFn = fn(String) -> Option; diff --git a/tests/data/zh.in.txt b/tests/data/zh.in.txt new file mode 100644 index 0000000..f95aa96 --- /dev/null +++ b/tests/data/zh.in.txt @@ -0,0 +1 @@ +这条法国邮船白拉日隆子爵号(VicomtedeBragelonne)正向中国开来。早晨八点多钟,冲洗过的三等舱甲板湿意未干,但已坐满了人,法国人、德国流亡出来的犹太人、印度人、安南人,不用说还有中国人。海风里早含着燥热,胖人身体给炎风吹干了,上一层汗结的盐霜,仿佛刚在巴勒斯坦的死海里洗过澡。毕竟是清晨,人的兴致还没给太阳晒萎,烘懒,说话做事都很起劲。那几个新派到安南或中国租界当警察的法国人,正围了那年轻善撒娇的犹太女人在调情。俾斯麦曾说过,法国公使大使的特点,就是一句外国话不会讲;这几位警察并不懂德文,居然传情达意,引得犹太女人格格地笑,比他们的外交官强多了。这女人的漂亮丈夫,在旁顾而乐之,因为他几天来,香烟、啤酒、柠檬水沾光了不少。红海已过,不怕热极引火,所以等一会甲板上零星果皮、纸片、瓶塞之外,香烟头定又遍处皆是。法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。 \ No newline at end of file diff --git a/tests/data/zh.out.txt b/tests/data/zh.out.txt new file mode 100644 index 0000000..82e77a0 --- /dev/null +++ b/tests/data/zh.out.txt @@ -0,0 +1,253 @@ +这 +条 +法国 +邮船 +白 +拉 +日隆 +子爵 +号 +VicomtedeBragelonne +正向 +中国 +开来 +早晨 +八点 +多 +钟 +冲洗 +过 +三等 +三等舱 +甲板 +湿 +意 +未 +干 +但 +已 +坐满 +人 +法国 +国人 +法国人 +德国 +流亡 +出来 +犹太 +犹太人 +印度 +印度人 +安南 +人 +不用 +不用说 +还有 +中国 +人 +海风 +里 +早 +含 +着 +燥热 +胖 +人 +身体 +给 +炎风 +吹干 +上 +一层 +汗 +结 +盐霜 +仿佛 +刚 +在 +巴勒 +勒斯 +巴勒斯 +巴勒斯坦 +死 +海里 +洗过 +洗过澡 +毕竟 +是 +清晨 +人 +兴致 +还 +没 +给 +太阳 +晒 +萎 +烘 +懒 +说话 +做事 +都 +很 +起劲 +那 +几个 +新派 +到 +安南 +或 +中国 +租界 +当 +警察 +法国 +国人 +法国人 +正 +围 +那 +年轻 +善 +撒娇 +犹太 +女人 +在 +调情 +俾斯麦 +曾 +说 +过 +法国 +公使 +大使 +特点 +就是 +一句 +外国 +话 +不会 +讲 +这 +几位 +警察 +并 +不 +懂 +德文 +居然 +传情 +达意 +引得 +犹太 +女人 +格格 +地 +笑 +比 +他们 +外交 +外交官 +强 +多 +这 +女人 +漂亮 +丈夫 +在 +旁 +顾 +而 +乐 +之 +因为 +他 +几天 +来 +香烟 +啤酒 +柠檬 +柠檬水 +沾光 +不少 +红海 +已 +过 +不怕 +热 +极 +引火 +所以 +等 +一会 +甲板 +上 +零星 +果皮 +纸片 +瓶塞 +之外 +香烟 +烟头 +香烟头 +定 +又 +遍 +处 +皆 +是 +法国 +国人 +法国人 +思想 +是 +有名 +清楚 +他 +文章 +也 +明白 +干净 +但是 +他 +做事 +无不 +混乱 +肮脏 +喧哗 +但 +看 +这 +船上 +乱糟 +乱糟糟 +这 +船 +倚仗 +人 +机巧 +载满 +人 +扰攘 +寄满 +人 +希望 +热闹 +地 +行 +着 +分钟 +每分钟 +把 +沾污 +人气 +一小 +方 +小 +面 +还给 +那 +无情 +无尽 +无际 +大海 diff --git a/tests/searchindex_fixture.json b/tests/searchindex_fixture.json index c3c8ee7..5e7b61c 100644 --- a/tests/searchindex_fixture.json +++ b/tests/searchindex_fixture.json @@ -1381,6 +1381,7 @@ } } }, + "lang": "English", "pipeline": [ "trimmer", "stopWordFilter", diff --git a/tests/test-compare.rs b/tests/test-compare.rs index fc5452a..21c9ab2 100644 --- a/tests/test-compare.rs +++ b/tests/test-compare.rs @@ -65,7 +65,7 @@ fn get_fixture() -> serde_json::Value { fn search_index_hasnt_changed_accidentally() { let new_index = create_index(); let fixture_index = get_fixture(); - + if new_index != fixture_index { panic!("The search index has changed from the fixture"); } diff --git a/tests/test-lang.rs b/tests/test-lang.rs index c58af79..4b63669 100644 --- a/tests/test-lang.rs +++ b/tests/test-lang.rs @@ -8,6 +8,8 @@ use std::io::{BufRead, BufReader, Read, Write}; use std::path::Path; use elasticlunr::pipeline::tokenize; +#[cfg(feature = "zh")] +use elasticlunr::pipeline::tokenize_chinese; use elasticlunr::*; use strum::IntoEnumIterator; @@ -61,7 +63,12 @@ fn compare_to_fixture(lang: Language) { let mut output = BufReader::new(File::open(&output).unwrap()).lines(); let pipeline = lang.make_pipeline(); - let tokens = pipeline.run(tokenize(&input_str)); + + let tokens = match lang { + #[cfg(feature = "zh")] + Language::Chinese => pipeline.run(tokenize_chinese(&input_str)), + _ => pipeline.run(tokenize(&input_str)), + }; for tok in tokens { assert_eq!(